From dc72ec808d97a83fe9d3c1889302067cbee24c91 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 18 Dec 2024 19:19:14 -0800 Subject: [PATCH 001/209] [RISCV] Custom legalize vp.merge for mask vectors. (#120479) The default legalization uses vmslt with a vector of XLen to compute a mask. This doesn't work if the type isn't legal. For fixed vectors it will scalarize. For scalable vectors it crashes the compiler. This patch uses an alternate strategy that promotes the i1 vector to an i8 vector and does the merge. I don't claim this to be the best lowering. I wrote it quickly almost 3 years ago when a crash was reported in our downstream. Fixes #120405. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 72 +++++- llvm/lib/Target/RISCV/RISCVISelLowering.h | 1 + .../RISCV/rvv/fixed-vectors-vpmerge.ll | 184 ++++++++++++++- llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll | 211 +++++++++++++++++- 4 files changed, 454 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index b703eb90e8ef3..affc29ec18ff7 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -758,9 +758,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, Custom); setOperationAction(ISD::SELECT, VT, Custom); - setOperationAction( - {ISD::SELECT_CC, ISD::VSELECT, ISD::VP_MERGE, ISD::VP_SELECT}, VT, - Expand); + setOperationAction({ISD::SELECT_CC, ISD::VSELECT, ISD::VP_SELECT}, VT, + Expand); + setOperationAction(ISD::VP_MERGE, VT, Custom); setOperationAction({ISD::VP_CTTZ_ELTS, ISD::VP_CTTZ_ELTS_ZERO_UNDEF}, VT, Custom); @@ -1237,6 +1237,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_SETCC, ISD::VP_TRUNCATE}, VT, Custom); + setOperationAction(ISD::VP_MERGE, VT, Custom); + setOperationAction(ISD::EXPERIMENTAL_VP_SPLICE, VT, Custom); setOperationAction(ISD::EXPERIMENTAL_VP_REVERSE, VT, Custom); continue; @@ -7492,8 +7494,11 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return lowerSET_ROUNDING(Op, DAG); case ISD::EH_DWARF_CFA: return lowerEH_DWARF_CFA(Op, DAG); - case ISD::VP_SELECT: case ISD::VP_MERGE: + if (Op.getSimpleValueType().getVectorElementType() == MVT::i1) + return lowerVPMergeMask(Op, DAG); + [[fallthrough]]; + case ISD::VP_SELECT: case ISD::VP_ADD: case ISD::VP_SUB: case ISD::VP_MUL: @@ -12078,6 +12083,65 @@ SDValue RISCVTargetLowering::lowerVPFPIntConvOp(SDValue Op, return convertFromScalableVector(VT, Result, DAG, Subtarget); } +SDValue RISCVTargetLowering::lowerVPMergeMask(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + MVT XLenVT = Subtarget.getXLenVT(); + + SDValue Mask = Op.getOperand(0); + SDValue TrueVal = Op.getOperand(1); + SDValue FalseVal = Op.getOperand(2); + SDValue VL = Op.getOperand(3); + + // Use default legalization if a vector of EVL type would be legal. + EVT EVLVecVT = EVT::getVectorVT(*DAG.getContext(), VL.getValueType(), + VT.getVectorElementCount()); + if (isTypeLegal(EVLVecVT)) + return SDValue(); + + MVT ContainerVT = VT; + if (VT.isFixedLengthVector()) { + ContainerVT = getContainerForFixedLengthVector(VT); + Mask = convertToScalableVector(ContainerVT, Mask, DAG, Subtarget); + TrueVal = convertToScalableVector(ContainerVT, TrueVal, DAG, Subtarget); + FalseVal = convertToScalableVector(ContainerVT, FalseVal, DAG, Subtarget); + } + + // Promote to a vector of i8. 
+ MVT PromotedVT = ContainerVT.changeVectorElementType(MVT::i8); + + // Promote TrueVal and FalseVal using VLMax. + // FIXME: Is there a better way to do this? + SDValue VLMax = DAG.getRegister(RISCV::X0, XLenVT); + SDValue SplatOne = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, PromotedVT, + DAG.getUNDEF(PromotedVT), + DAG.getConstant(1, DL, XLenVT), VLMax); + SDValue SplatZero = DAG.getNode(RISCVISD::VMV_V_X_VL, DL, PromotedVT, + DAG.getUNDEF(PromotedVT), + DAG.getConstant(0, DL, XLenVT), VLMax); + TrueVal = DAG.getNode(RISCVISD::VMERGE_VL, DL, PromotedVT, TrueVal, SplatOne, + SplatZero, DAG.getUNDEF(PromotedVT), VL); + // Any element past VL uses FalseVal, so use VLMax + FalseVal = DAG.getNode(RISCVISD::VMERGE_VL, DL, PromotedVT, FalseVal, + SplatOne, SplatZero, DAG.getUNDEF(PromotedVT), VLMax); + + // VP_MERGE the two promoted values. + SDValue VPMerge = DAG.getNode(RISCVISD::VMERGE_VL, DL, PromotedVT, Mask, + TrueVal, FalseVal, FalseVal, VL); + + // Convert back to mask. + SDValue TrueMask = DAG.getNode(RISCVISD::VMSET_VL, DL, ContainerVT, VL); + SDValue Result = DAG.getNode( + RISCVISD::SETCC_VL, DL, ContainerVT, + {VPMerge, DAG.getConstant(0, DL, PromotedVT), DAG.getCondCode(ISD::SETNE), + DAG.getUNDEF(getMaskTypeFor(ContainerVT)), TrueMask, VLMax}); + + if (VT.isFixedLengthVector()) + Result = convertFromScalableVector(VT, Result, DAG, Subtarget); + return Result; +} + SDValue RISCVTargetLowering::lowerVPSpliceExperimental(SDValue Op, SelectionDAG &DAG) const { diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 0944bb8793a94..4c78fd784a3c8 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -996,6 +996,7 @@ class RISCVTargetLowering : public TargetLowering { SDValue lowerLogicVPOp(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPExtMaskOp(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPSetCCMaskOp(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVPMergeMask(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPSplatExperimental(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPSpliceExperimental(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVPReverseExperimental(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll index a53d33e6120d5..6394542479d1b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vpmerge.ll @@ -58,6 +58,182 @@ define <4 x i1> @vpmerge_vv_v4i1(<4 x i1> %va, <4 x i1> %vb, <4 x i1> %m, i32 ze ret <4 x i1> %v } +define <8 x i1> @vpmerge_vv_v8i1(<8 x i1> %va, <8 x i1> %vb, <8 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpmerge_vv_v8i1: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vid.v v10 +; RV32-NEXT: vmsltu.vx v12, v10, a0 +; RV32-NEXT: vmand.mm v9, v9, v12 +; RV32-NEXT: vmandn.mm v8, v8, v9 +; RV32-NEXT: vmand.mm v9, v0, v9 +; RV32-NEXT: vmor.mm v0, v9, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vpmerge_vv_v8i1: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: vid.v v12 +; RV64-NEXT: vmsltu.vx v10, v12, a0 +; RV64-NEXT: vmand.mm v9, v9, v10 +; RV64-NEXT: vmandn.mm v8, v8, v9 +; RV64-NEXT: vmand.mm v9, v0, v9 +; RV64-NEXT: vmor.mm v0, v9, v8 +; RV64-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vpmerge_vv_v8i1: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32ZVFHMIN-NEXT: vid.v v10 +; 
RV32ZVFHMIN-NEXT: vmsltu.vx v12, v10, a0 +; RV32ZVFHMIN-NEXT: vmand.mm v9, v9, v12 +; RV32ZVFHMIN-NEXT: vmandn.mm v8, v8, v9 +; RV32ZVFHMIN-NEXT: vmand.mm v9, v0, v9 +; RV32ZVFHMIN-NEXT: vmor.mm v0, v9, v8 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vpmerge_vv_v8i1: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64ZVFHMIN-NEXT: vid.v v12 +; RV64ZVFHMIN-NEXT: vmsltu.vx v10, v12, a0 +; RV64ZVFHMIN-NEXT: vmand.mm v9, v9, v10 +; RV64ZVFHMIN-NEXT: vmandn.mm v8, v8, v9 +; RV64ZVFHMIN-NEXT: vmand.mm v9, v0, v9 +; RV64ZVFHMIN-NEXT: vmor.mm v0, v9, v8 +; RV64ZVFHMIN-NEXT: ret + %v = call <8 x i1> @llvm.vp.merge.v8i1(<8 x i1> %m, <8 x i1> %va, <8 x i1> %vb, i32 %evl) + ret <8 x i1> %v +} + +define <16 x i1> @vpmerge_vv_v16i1(<16 x i1> %va, <16 x i1> %vb, <16 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpmerge_vv_v16i1: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vid.v v12 +; RV32-NEXT: vmsltu.vx v10, v12, a0 +; RV32-NEXT: vmand.mm v9, v9, v10 +; RV32-NEXT: vmandn.mm v8, v8, v9 +; RV32-NEXT: vmand.mm v9, v0, v9 +; RV32-NEXT: vmor.mm v0, v9, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vpmerge_vv_v16i1: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-NEXT: vid.v v16 +; RV64-NEXT: vmsltu.vx v10, v16, a0 +; RV64-NEXT: vmand.mm v9, v9, v10 +; RV64-NEXT: vmandn.mm v8, v8, v9 +; RV64-NEXT: vmand.mm v9, v0, v9 +; RV64-NEXT: vmor.mm v0, v9, v8 +; RV64-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vpmerge_vv_v16i1: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32ZVFHMIN-NEXT: vid.v v12 +; RV32ZVFHMIN-NEXT: vmsltu.vx v10, v12, a0 +; RV32ZVFHMIN-NEXT: vmand.mm v9, v9, v10 +; RV32ZVFHMIN-NEXT: vmandn.mm v8, v8, v9 +; RV32ZVFHMIN-NEXT: vmand.mm v9, v0, v9 +; RV32ZVFHMIN-NEXT: vmor.mm v0, v9, v8 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vpmerge_vv_v16i1: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64ZVFHMIN-NEXT: vid.v v16 +; RV64ZVFHMIN-NEXT: vmsltu.vx v10, v16, a0 +; RV64ZVFHMIN-NEXT: vmand.mm v9, v9, v10 +; RV64ZVFHMIN-NEXT: vmandn.mm v8, v8, v9 +; RV64ZVFHMIN-NEXT: vmand.mm v9, v0, v9 +; RV64ZVFHMIN-NEXT: vmor.mm v0, v9, v8 +; RV64ZVFHMIN-NEXT: ret + %v = call <16 x i1> @llvm.vp.merge.v16i1(<16 x i1> %m, <16 x i1> %va, <16 x i1> %vb, i32 %evl) + ret <16 x i1> %v +} + +define <32 x i1> @vpmerge_vv_v32i1(<32 x i1> %va, <32 x i1> %vb, <32 x i1> %m, i32 zeroext %evl) { +; RV32-LABEL: vpmerge_vv_v32i1: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 32 +; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-NEXT: vid.v v16 +; RV32-NEXT: vmsltu.vx v10, v16, a0 +; RV32-NEXT: vmand.mm v9, v9, v10 +; RV32-NEXT: vmandn.mm v8, v8, v9 +; RV32-NEXT: vmand.mm v9, v0, v9 +; RV32-NEXT: vmor.mm v0, v9, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vpmerge_vv_v32i1: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; RV64-NEXT: vmerge.vim v12, v10, 1, v0 +; RV64-NEXT: vmv1r.v v0, v8 +; RV64-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV64-NEXT: vmerge.vim v10, v10, 1, v0 +; RV64-NEXT: vmv1r.v v0, v9 +; RV64-NEXT: vsetvli zero, a0, e8, m2, tu, ma +; RV64-NEXT: vmerge.vvm v10, v10, v12, v0 +; RV64-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: ret +; +; RV32ZVFHMIN-LABEL: vpmerge_vv_v32i1: +; RV32ZVFHMIN: # %bb.0: +; RV32ZVFHMIN-NEXT: li a1, 32 +; RV32ZVFHMIN-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32ZVFHMIN-NEXT: vid.v v16 +; 
RV32ZVFHMIN-NEXT: vmsltu.vx v10, v16, a0 +; RV32ZVFHMIN-NEXT: vmand.mm v9, v9, v10 +; RV32ZVFHMIN-NEXT: vmandn.mm v8, v8, v9 +; RV32ZVFHMIN-NEXT: vmand.mm v9, v0, v9 +; RV32ZVFHMIN-NEXT: vmor.mm v0, v9, v8 +; RV32ZVFHMIN-NEXT: ret +; +; RV64ZVFHMIN-LABEL: vpmerge_vv_v32i1: +; RV64ZVFHMIN: # %bb.0: +; RV64ZVFHMIN-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV64ZVFHMIN-NEXT: vmv.v.i v10, 0 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; RV64ZVFHMIN-NEXT: vmerge.vim v12, v10, 1, v0 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v8 +; RV64ZVFHMIN-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV64ZVFHMIN-NEXT: vmerge.vim v10, v10, 1, v0 +; RV64ZVFHMIN-NEXT: vmv1r.v v0, v9 +; RV64ZVFHMIN-NEXT: vsetvli zero, a0, e8, m2, tu, ma +; RV64ZVFHMIN-NEXT: vmerge.vvm v10, v10, v12, v0 +; RV64ZVFHMIN-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; RV64ZVFHMIN-NEXT: vmsne.vi v0, v10, 0 +; RV64ZVFHMIN-NEXT: ret + %v = call <32 x i1> @llvm.vp.merge.v32i1(<32 x i1> %m, <32 x i1> %va, <32 x i1> %vb, i32 %evl) + ret <32 x i1> %v +} + +define <64 x i1> @vpmerge_vv_v64i1(<64 x i1> %va, <64 x i1> %vb, <64 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vpmerge_vv_v64i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vmerge.vim v16, v12, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; CHECK-NEXT: vmerge.vim v12, v12, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0 +; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma +; CHECK-NEXT: vmsne.vi v0, v12, 0 +; CHECK-NEXT: ret + %v = call <64 x i1> @llvm.vp.merge.v64i1(<64 x i1> %m, <64 x i1> %va, <64 x i1> %vb, i32 %evl) + ret <64 x i1> %v +} + declare <2 x i8> @llvm.vp.merge.v2i8(<2 x i1>, <2 x i8>, <2 x i8>, i32) define <2 x i8> @vpmerge_vv_v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 zeroext %evl) { @@ -1188,10 +1364,10 @@ define <32 x double> @vpmerge_vv_v32f64(<32 x double> %va, <32 x double> %vb, <3 ; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: li a1, 16 ; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: bltu a2, a1, .LBB79_2 +; CHECK-NEXT: bltu a2, a1, .LBB83_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a0, 16 -; CHECK-NEXT: .LBB79_2: +; CHECK-NEXT: .LBB83_2: ; CHECK-NEXT: vsetvli zero, a0, e64, m8, tu, ma ; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 ; CHECK-NEXT: addi a0, a2, -16 @@ -1221,10 +1397,10 @@ define <32 x double> @vpmerge_vf_v32f64(double %a, <32 x double> %vb, <32 x i1> ; CHECK: # %bb.0: ; CHECK-NEXT: li a2, 16 ; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: bltu a0, a2, .LBB80_2 +; CHECK-NEXT: bltu a0, a2, .LBB84_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: li a1, 16 -; CHECK-NEXT: .LBB80_2: +; CHECK-NEXT: .LBB84_2: ; CHECK-NEXT: vsetvli zero, a1, e64, m8, tu, ma ; CHECK-NEXT: vfmerge.vfm v8, v8, fa0, v0 ; CHECK-NEXT: addi a1, a0, -16 diff --git a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll index 88a8ebcc90054..4cd77185e6930 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vpmerge-sdnode.ll @@ -35,6 +35,205 @@ define @vpmerge_nxv1i1( %va, %v } +define @vpmerge_nxv2i1( %va, %vb, %m, i32 zeroext %evl) { +; RV32-LABEL: vpmerge_nxv2i1: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; RV32-NEXT: vid.v v10 +; RV32-NEXT: vmsltu.vx v10, v10, a0 +; RV32-NEXT: vmand.mm v9, v9, v10 +; RV32-NEXT: vmandn.mm v8, v8, v9 +; RV32-NEXT: vmand.mm v9, v0, v9 +; RV32-NEXT: vmor.mm v0, 
v9, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vpmerge_nxv2i1: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV64-NEXT: vid.v v10 +; RV64-NEXT: vmsltu.vx v12, v10, a0 +; RV64-NEXT: vmand.mm v9, v9, v12 +; RV64-NEXT: vmandn.mm v8, v8, v9 +; RV64-NEXT: vmand.mm v9, v0, v9 +; RV64-NEXT: vmor.mm v0, v9, v8 +; RV64-NEXT: ret + %v = call @llvm.vp.merge.nxv2i1( %m, %va, %vb, i32 %evl) + ret %v +} + +define @vpmerge_nxv4i1( %va, %vb, %m, i32 zeroext %evl) { +; RV32-LABEL: vpmerge_nxv4i1: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; RV32-NEXT: vid.v v10 +; RV32-NEXT: vmsltu.vx v12, v10, a0 +; RV32-NEXT: vmand.mm v9, v9, v12 +; RV32-NEXT: vmandn.mm v8, v8, v9 +; RV32-NEXT: vmand.mm v9, v0, v9 +; RV32-NEXT: vmor.mm v0, v9, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vpmerge_nxv4i1: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV64-NEXT: vid.v v12 +; RV64-NEXT: vmsltu.vx v10, v12, a0 +; RV64-NEXT: vmand.mm v9, v9, v10 +; RV64-NEXT: vmandn.mm v8, v8, v9 +; RV64-NEXT: vmand.mm v9, v0, v9 +; RV64-NEXT: vmor.mm v0, v9, v8 +; RV64-NEXT: ret + %v = call @llvm.vp.merge.nxv4i1( %m, %va, %vb, i32 %evl) + ret %v +} + +define @vpmerge_nxv8i1( %va, %vb, %m, i32 zeroext %evl) { +; RV32-LABEL: vpmerge_nxv8i1: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32, m4, ta, ma +; RV32-NEXT: vid.v v12 +; RV32-NEXT: vmsltu.vx v10, v12, a0 +; RV32-NEXT: vmand.mm v9, v9, v10 +; RV32-NEXT: vmandn.mm v8, v8, v9 +; RV32-NEXT: vmand.mm v9, v0, v9 +; RV32-NEXT: vmor.mm v0, v9, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vpmerge_nxv8i1: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV64-NEXT: vid.v v16 +; RV64-NEXT: vmsltu.vx v10, v16, a0 +; RV64-NEXT: vmand.mm v9, v9, v10 +; RV64-NEXT: vmandn.mm v8, v8, v9 +; RV64-NEXT: vmand.mm v9, v0, v9 +; RV64-NEXT: vmor.mm v0, v9, v8 +; RV64-NEXT: ret + %v = call @llvm.vp.merge.nxv8i1( %m, %va, %vb, i32 %evl) + ret %v +} + +define @vpmerge_nxv16i1( %va, %vb, %m, i32 zeroext %evl) { +; RV32-LABEL: vpmerge_nxv16i1: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma +; RV32-NEXT: vid.v v16 +; RV32-NEXT: vmsltu.vx v10, v16, a0 +; RV32-NEXT: vmand.mm v9, v9, v10 +; RV32-NEXT: vmandn.mm v8, v8, v9 +; RV32-NEXT: vmand.mm v9, v0, v9 +; RV32-NEXT: vmor.mm v0, v9, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: vpmerge_nxv16i1: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV64-NEXT: vmv.v.i v10, 0 +; RV64-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; RV64-NEXT: vmerge.vim v12, v10, 1, v0 +; RV64-NEXT: vmv1r.v v0, v8 +; RV64-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; RV64-NEXT: vmerge.vim v10, v10, 1, v0 +; RV64-NEXT: vmv1r.v v0, v9 +; RV64-NEXT: vsetvli zero, a0, e8, m2, tu, ma +; RV64-NEXT: vmerge.vvm v10, v10, v12, v0 +; RV64-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; RV64-NEXT: vmsne.vi v0, v10, 0 +; RV64-NEXT: ret + %v = call @llvm.vp.merge.nxv16i1( %m, %va, %vb, i32 %evl) + ret %v +} + +define @vpmerge_nxv32i1( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpmerge_nxv32i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vmerge.vim v16, v12, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; CHECK-NEXT: vmerge.vim v12, v12, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v12, v12, v16, v0 +; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma +; CHECK-NEXT: vmsne.vi v0, v12, 0 +; 
CHECK-NEXT: ret + %v = call @llvm.vp.merge.nxv32i1( %m, %va, %vb, i32 %evl) + ret %v +} + +define @vpmerge_nxv64i1( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpmerge_nxv64i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vmerge.vim v24, v16, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma +; CHECK-NEXT: vmerge.vim v16, v16, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, tu, ma +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 +; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma +; CHECK-NEXT: vmsne.vi v0, v16, 0 +; CHECK-NEXT: ret + %v = call @llvm.vp.merge.nxv64i1( %m, %va, %vb, i32 %evl) + ret %v +} + +define @vpmerge_nxv128i1( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vpmerge_nxv128i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v7, v12 +; CHECK-NEXT: vmv1r.v v4, v11 +; CHECK-NEXT: vmv1r.v v6, v10 +; CHECK-NEXT: vmv1r.v v3, v9 +; CHECK-NEXT: vmv1r.v v5, v8 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: bltu a0, a2, .LBB7_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a1, a2 +; CHECK-NEXT: .LBB7_2: +; CHECK-NEXT: vsetvli a3, zero, e8, m8, ta, ma +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: sub a2, a0, a2 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vmerge.vim v24, v16, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v3 +; CHECK-NEXT: vsetvli a3, zero, e8, m8, ta, ma +; CHECK-NEXT: vmerge.vim v16, v16, 1, v0 +; CHECK-NEXT: sltu a0, a0, a2 +; CHECK-NEXT: vmv1r.v v0, v4 +; CHECK-NEXT: vsetvli zero, a1, e8, m8, tu, ma +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma +; CHECK-NEXT: vmsne.vi v9, v16, 0 +; CHECK-NEXT: and a0, a0, a2 +; CHECK-NEXT: vmv1r.v v0, v5 +; CHECK-NEXT: vmv.v.i v24, 0 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma +; CHECK-NEXT: vmerge.vim v16, v24, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v6 +; CHECK-NEXT: vsetvli a1, zero, e8, m8, ta, ma +; CHECK-NEXT: vmerge.vim v24, v24, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v7 +; CHECK-NEXT: vsetvli zero, a0, e8, m8, tu, ma +; CHECK-NEXT: vmerge.vvm v24, v24, v16, v0 +; CHECK-NEXT: vsetvli a0, zero, e8, m8, ta, ma +; CHECK-NEXT: vmsne.vi v8, v24, 0 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: ret + %v = call @llvm.vp.merge.nxv128i1( %m, %va, %vb, i32 %evl) + ret %v +} + declare @llvm.vp.merge.nxv1i8(, , , i32) define @vpmerge_vv_nxv1i8( %va, %vb, %m, i32 zeroext %evl) { @@ -378,10 +577,10 @@ define @vpmerge_vv_nxv128i8( %va, @vpmerge_vx_nxv128i8(i8 %a, %vb, ; CHECK-NEXT: and a3, a4, a3 ; CHECK-NEXT: vsetvli zero, a3, e8, m8, tu, ma ; CHECK-NEXT: vmerge.vxm v16, v16, a0, v0 -; CHECK-NEXT: bltu a2, a1, .LBB29_2 +; CHECK-NEXT: bltu a2, a1, .LBB36_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: .LBB29_2: +; CHECK-NEXT: .LBB36_2: ; CHECK-NEXT: vmv1r.v v0, v24 ; CHECK-NEXT: vsetvli zero, a2, e8, m8, tu, ma ; CHECK-NEXT: vmerge.vxm v8, v8, a0, v0 @@ -440,10 +639,10 @@ define @vpmerge_vi_nxv128i8( %vb, Date: Wed, 18 Dec 2024 20:23:50 -0700 Subject: [PATCH 002/209] [Sema] Fix tautological bounds check warning with -fwrapv (#120480) The tautological bounds check warning added in #120222 does not take into account whether signed integer overflow is well defined or not, which could result in a developer removing a bounds check that may not actually be always false because of different 
overflow semantics. ```c int check(const int* foo, unsigned int idx) { return foo + idx < foo; } ``` ``` $ clang -O2 -c test.c test.c:3:19: warning: pointer comparison always evaluates to false [-Wtautological-compare] 3 | return foo + idx < foo; | ^ 1 warning generated. # Bounds check is eliminated without -fwrapv, warning was correct $ llvm-objdump -dr test.o ... 0000000000000000 : 0: 31 c0 xorl %eax, %eax 2: c3 retq ``` ``` $ clang -O2 -fwrapv -c test.c test.c:3:19: warning: pointer comparison always evaluates to false [-Wtautological-compare] 3 | return foo + idx < foo; | ^ 1 warning generated. # Bounds check remains, warning was wrong $ llvm-objdump -dr test.o 0000000000000000 : 0: 89 f0 movl %esi, %eax 2: 48 8d 0c 87 leaq (%rdi,%rax,4), %rcx 6: 31 c0 xorl %eax, %eax 8: 48 39 f9 cmpq %rdi, %rcx b: 0f 92 c0 setb %al e: c3 retq ``` --- clang/lib/Sema/SemaExpr.cpp | 7 ++++--- clang/test/Sema/tautological-pointer-comparison.c | 3 +++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index e06a092177ef0..24f7d27c69115 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -11789,10 +11789,11 @@ static bool checkForArray(const Expr *E) { /// Detect patterns ptr + size >= ptr and ptr + size < ptr, where ptr is a /// pointer and size is an unsigned integer. Return whether the result is /// always true/false. -static std::optional isTautologicalBoundsCheck(const Expr *LHS, +static std::optional isTautologicalBoundsCheck(Sema &S, const Expr *LHS, const Expr *RHS, BinaryOperatorKind Opc) { - if (!LHS->getType()->isPointerType()) + if (!LHS->getType()->isPointerType() || + S.getLangOpts().isSignedOverflowDefined()) return std::nullopt; // Canonicalize to >= or < predicate. @@ -11940,7 +11941,7 @@ static void diagnoseTautologicalComparison(Sema &S, SourceLocation Loc, << 1 /*array comparison*/ << Result); } else if (std::optional Res = - isTautologicalBoundsCheck(LHS, RHS, Opc)) { + isTautologicalBoundsCheck(S, LHS, RHS, Opc)) { S.DiagRuntimeBehavior(Loc, nullptr, S.PDiag(diag::warn_comparison_always) << 2 /*pointer comparison*/ diff --git a/clang/test/Sema/tautological-pointer-comparison.c b/clang/test/Sema/tautological-pointer-comparison.c index 19cd20e5f7d21..1c5973b01a30d 100644 --- a/clang/test/Sema/tautological-pointer-comparison.c +++ b/clang/test/Sema/tautological-pointer-comparison.c @@ -1,4 +1,7 @@ // RUN: %clang_cc1 -fsyntax-only -verify %s +// RUN: %clang_cc1 -fsyntax-only -fwrapv -verify=fwrapv %s + +// fwrapv-no-diagnostics int add_ptr_idx_ult_ptr(const char *ptr, unsigned index) { return ptr + index < ptr; // expected-warning {{pointer comparison always evaluates to false}} From 1cc926b8b6976ac4a5a411eae564cfde2df1ef9d Mon Sep 17 00:00:00 2001 From: Pavel Samolysov Date: Thu, 19 Dec 2024 06:36:48 +0300 Subject: [PATCH 003/209] [ADT] Add a unittest for the ScopedHashTable class (#120183) The ScopedHashTable class is particularly used to develop string tables for parsers and code convertors. For instance, the MLIRGen class from the toy example for MLIR actively uses this class to define scopes for declared variables. To demonstrate common use cases for the ScopedHashTable class as well as to check its behavior in different situations, the unittest has been added. 
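For reference, a minimal usage sketch of the scoping behavior the new tests exercise (illustrative only, not part of the patch; the function and variable names here are made up):

```c++
#include "llvm/ADT/ScopedHashTable.h"
#include "llvm/ADT/StringRef.h"
using namespace llvm;

void example() {
  ScopedHashTable<StringRef, StringRef> SymbolTable;
  ScopedHashTableScope<StringRef, StringRef> GlobalScope(SymbolTable);
  SymbolTable.insert("x", "global");   // visible while GlobalScope is alive
  {
    ScopedHashTableScope<StringRef, StringRef> FnScope(SymbolTable);
    SymbolTable.insert("x", "local");  // shadows the outer binding
    // SymbolTable.lookup("x") == "local"
  }                                    // FnScope destroyed; shadowing undone
  // SymbolTable.lookup("x") == "global"
}
```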
Signed-off-by: Pavel Samolysov --- llvm/unittests/ADT/CMakeLists.txt | 1 + llvm/unittests/ADT/ScopedHashTableTest.cpp | 145 +++++++++++++++++++++ 2 files changed, 146 insertions(+) create mode 100644 llvm/unittests/ADT/ScopedHashTableTest.cpp diff --git a/llvm/unittests/ADT/CMakeLists.txt b/llvm/unittests/ADT/CMakeLists.txt index 07568ad0c64e3..dafd73518aedb 100644 --- a/llvm/unittests/ADT/CMakeLists.txt +++ b/llvm/unittests/ADT/CMakeLists.txt @@ -67,6 +67,7 @@ add_llvm_unittest(ADTTests SCCIteratorTest.cpp STLExtrasTest.cpp STLForwardCompatTest.cpp + ScopedHashTableTest.cpp ScopeExitTest.cpp SequenceTest.cpp SetOperationsTest.cpp diff --git a/llvm/unittests/ADT/ScopedHashTableTest.cpp b/llvm/unittests/ADT/ScopedHashTableTest.cpp new file mode 100644 index 0000000000000..64afa948d9a17 --- /dev/null +++ b/llvm/unittests/ADT/ScopedHashTableTest.cpp @@ -0,0 +1,145 @@ +//===- ScopedHashTableTest.cpp - ScopedHashTable unit tests ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/ScopedHashTable.h" +#include "llvm/ADT/StringRef.h" +#include "gtest/gtest.h" +#include +#include + +using ::llvm::ScopedHashTable; +using ::llvm::ScopedHashTableScope; +using ::llvm::StringLiteral; +using ::llvm::StringRef; + +using ::testing::Test; + +class ScopedHashTableTest : public Test { +protected: + ScopedHashTableTest() { symbolTable.insert(kGlobalName, kGlobalValue); } + + ScopedHashTable symbolTable{}; + ScopedHashTableScope globalScope{symbolTable}; + + static constexpr StringLiteral kGlobalName = "global"; + static constexpr StringLiteral kGlobalValue = "gvalue"; + static constexpr StringLiteral kLocalName = "local"; + static constexpr StringLiteral kLocalValue = "lvalue"; + static constexpr StringLiteral kLocalValue2 = "lvalue2"; +}; + +TEST_F(ScopedHashTableTest, AccessWithNoActiveScope) { + EXPECT_EQ(symbolTable.count(kGlobalName), 1U); +} + +TEST_F(ScopedHashTableTest, AccessWithAScope) { + [[maybe_unused]] ScopedHashTableScope varScope(symbolTable); + EXPECT_EQ(symbolTable.count(kGlobalName), 1U); +} + +TEST_F(ScopedHashTableTest, InsertInScope) { + [[maybe_unused]] ScopedHashTableScope varScope(symbolTable); + symbolTable.insert(kLocalName, kLocalValue); + EXPECT_EQ(symbolTable.count(kLocalName), 1U); +} + +TEST_F(ScopedHashTableTest, InsertInLinearSortedScope) { + [[maybe_unused]] ScopedHashTableScope varScope(symbolTable); + [[maybe_unused]] ScopedHashTableScope varScope2(symbolTable); + [[maybe_unused]] ScopedHashTableScope varScope3(symbolTable); + symbolTable.insert(kLocalName, kLocalValue); + EXPECT_EQ(symbolTable.count(kLocalName), 1U); +} + +TEST_F(ScopedHashTableTest, InsertInOutedScope) { + { + [[maybe_unused]] ScopedHashTableScope varScope(symbolTable); + symbolTable.insert(kLocalName, kLocalValue); + } + EXPECT_EQ(symbolTable.count(kLocalName), 0U); +} + +TEST_F(ScopedHashTableTest, OverrideInScope) { + [[maybe_unused]] ScopedHashTableScope funScope(symbolTable); + symbolTable.insert(kLocalName, kLocalValue); + { + [[maybe_unused]] ScopedHashTableScope varScope(symbolTable); + symbolTable.insert(kLocalName, kLocalValue2); + EXPECT_EQ(symbolTable.lookup(kLocalName), kLocalValue2); + } + EXPECT_EQ(symbolTable.lookup(kLocalName), kLocalValue); +} + +TEST_F(ScopedHashTableTest, GetCurScope) { + 
EXPECT_EQ(symbolTable.getCurScope(), &globalScope); + { + ScopedHashTableScope funScope(symbolTable); + ScopedHashTableScope funScope2(symbolTable); + EXPECT_EQ(symbolTable.getCurScope(), &funScope2); + { + ScopedHashTableScope blockScope(symbolTable); + EXPECT_EQ(symbolTable.getCurScope(), &blockScope); + } + EXPECT_EQ(symbolTable.getCurScope(), &funScope2); + } + EXPECT_EQ(symbolTable.getCurScope(), &globalScope); +} + +TEST_F(ScopedHashTableTest, PopScope) { + using SymbolTableScopeTy = ScopedHashTable::ScopeTy; + + std::stack ExpectedValues; + std::stack> Scopes; + + Scopes.emplace(std::make_unique(symbolTable)); + ExpectedValues.emplace(kLocalValue); + symbolTable.insert(kGlobalName, kLocalValue); + + Scopes.emplace(std::make_unique(symbolTable)); + ExpectedValues.emplace(kLocalValue2); + symbolTable.insert(kGlobalName, kLocalValue2); + + while (symbolTable.getCurScope() != &globalScope) { + EXPECT_EQ(symbolTable.getCurScope(), Scopes.top().get()); + EXPECT_EQ(symbolTable.lookup(kGlobalName), ExpectedValues.top()); + ExpectedValues.pop(); + Scopes.pop(); // destructs the SymbolTableScopeTy instance implicitly + // calling Scopes.top()->~SymbolTableScopeTy(); + EXPECT_NE(symbolTable.getCurScope(), nullptr); + } + ASSERT_TRUE(ExpectedValues.empty()); + ASSERT_TRUE(Scopes.empty()); + EXPECT_EQ(symbolTable.lookup(kGlobalName), kGlobalValue); +} + +TEST_F(ScopedHashTableTest, DISABLED_PopScopeOnStack) { + using SymbolTableScopeTy = ScopedHashTable::ScopeTy; + SymbolTableScopeTy funScope(symbolTable); + symbolTable.insert(kGlobalName, kLocalValue); + SymbolTableScopeTy funScope2(symbolTable); + symbolTable.insert(kGlobalName, kLocalValue2); + + std::stack expectedValues{{kLocalValue, kLocalValue2}}; + std::stack expectedScopes{{&funScope, &funScope2}}; + + while (symbolTable.getCurScope() != &globalScope) { + EXPECT_EQ(symbolTable.getCurScope(), expectedScopes.top()); + expectedScopes.pop(); + EXPECT_EQ(symbolTable.lookup(kGlobalName), expectedValues.top()); + expectedValues.pop(); + symbolTable.getCurScope()->~SymbolTableScopeTy(); + EXPECT_NE(symbolTable.getCurScope(), nullptr); + } + + // We have imbalanced scopes here: + // Assertion `HT.CurScope == this && "Scope imbalance!"' failed + // HT.CurScope is a pointer to the `globalScope` while + // `SymbolTableScopeTy.this` is still a pointer to `funScope2`. + // There is no way to write an assert on an assert in googletest so that we + // mark the test case as DISABLED. 
+} From 76275c0c41739a30939afd1709174861a587a823 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 19 Dec 2024 03:36:52 +0000 Subject: [PATCH 004/209] [gn build] Port 1cc926b8b697 --- llvm/utils/gn/secondary/llvm/unittests/ADT/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/unittests/ADT/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/ADT/BUILD.gn index 3541a7ae45291..92e596ea6a004 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/ADT/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/ADT/BUILD.gn @@ -76,6 +76,7 @@ unittest("ADTTests") { "STLExtrasTest.cpp", "STLForwardCompatTest.cpp", "ScopeExitTest.cpp", + "ScopedHashTableTest.cpp", "SequenceTest.cpp", "SetOperationsTest.cpp", "SetVectorTest.cpp", From fe2685303b215182b1acc5b6fb8be30c24bd6e8e Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Wed, 18 Dec 2024 19:39:02 -0800 Subject: [PATCH 005/209] [clang-format] Fix a crash caused by commit f03bf8c45f43 --- clang/lib/Format/TokenAnnotator.cpp | 2 +- clang/unittests/Format/FormatTest.cpp | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 6a8caa23753f3..f2cfa7f49f62f 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -493,7 +493,7 @@ class AnnotatingParser { (CurrentToken->Next->is(tok::l_paren) || (CurrentToken->Next->is(tok::l_square) && (Line.MustBeDeclaration || - PrevNonComment->isTypeName(LangOpts))))) { + (PrevNonComment && PrevNonComment->isTypeName(LangOpts)))))) { OpeningParen.setType(OpeningParen.Next->is(tok::caret) ? TT_ObjCBlockLParen : TT_FunctionTypeLParen); diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index e892f10433c55..47465a18e9a41 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -13691,6 +13691,10 @@ TEST_F(FormatTest, FormatsArrays) { " .aaaaaaaaaaaaaaaaaaaaaa();"); verifyFormat("a[::b::c];"); + verifyFormat("{\n" + " (*a)[0] = 1;\n" + "}"); + verifyNoCrash("a[,Y?)]", getLLVMStyleWithColumns(10)); FormatStyle NoColumnLimit = getLLVMStyleWithColumns(0); From c94ce0cca25229cd0e38560ad6e56a1a2f9a0c8b Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 18 Dec 2024 19:59:11 -0800 Subject: [PATCH 006/209] [ADT] Fix warnings This patch fixes warnings of the form: llvm/unittests/ADT/ScopedHashTableTest.cpp:41:20: error: 'ScopedHashTableScope' may not intend to support class template argument deduction [-Werror,-Wctad-maybe-unsupported] --- llvm/unittests/ADT/ScopedHashTableTest.cpp | 30 ++++++++++++++-------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/llvm/unittests/ADT/ScopedHashTableTest.cpp b/llvm/unittests/ADT/ScopedHashTableTest.cpp index 64afa948d9a17..8ce5c7cecf998 100644 --- a/llvm/unittests/ADT/ScopedHashTableTest.cpp +++ b/llvm/unittests/ADT/ScopedHashTableTest.cpp @@ -38,37 +38,45 @@ TEST_F(ScopedHashTableTest, AccessWithNoActiveScope) { } TEST_F(ScopedHashTableTest, AccessWithAScope) { - [[maybe_unused]] ScopedHashTableScope varScope(symbolTable); + [[maybe_unused]] ScopedHashTableScope varScope( + symbolTable); EXPECT_EQ(symbolTable.count(kGlobalName), 1U); } TEST_F(ScopedHashTableTest, InsertInScope) { - [[maybe_unused]] ScopedHashTableScope varScope(symbolTable); + [[maybe_unused]] ScopedHashTableScope varScope( + symbolTable); symbolTable.insert(kLocalName, kLocalValue); EXPECT_EQ(symbolTable.count(kLocalName), 1U); } TEST_F(ScopedHashTableTest, 
InsertInLinearSortedScope) { - [[maybe_unused]] ScopedHashTableScope varScope(symbolTable); - [[maybe_unused]] ScopedHashTableScope varScope2(symbolTable); - [[maybe_unused]] ScopedHashTableScope varScope3(symbolTable); + [[maybe_unused]] ScopedHashTableScope varScope( + symbolTable); + [[maybe_unused]] ScopedHashTableScope varScope2( + symbolTable); + [[maybe_unused]] ScopedHashTableScope varScope3( + symbolTable); symbolTable.insert(kLocalName, kLocalValue); EXPECT_EQ(symbolTable.count(kLocalName), 1U); } TEST_F(ScopedHashTableTest, InsertInOutedScope) { { - [[maybe_unused]] ScopedHashTableScope varScope(symbolTable); + [[maybe_unused]] ScopedHashTableScope varScope( + symbolTable); symbolTable.insert(kLocalName, kLocalValue); } EXPECT_EQ(symbolTable.count(kLocalName), 0U); } TEST_F(ScopedHashTableTest, OverrideInScope) { - [[maybe_unused]] ScopedHashTableScope funScope(symbolTable); + [[maybe_unused]] ScopedHashTableScope funScope( + symbolTable); symbolTable.insert(kLocalName, kLocalValue); { - [[maybe_unused]] ScopedHashTableScope varScope(symbolTable); + [[maybe_unused]] ScopedHashTableScope varScope( + symbolTable); symbolTable.insert(kLocalName, kLocalValue2); EXPECT_EQ(symbolTable.lookup(kLocalName), kLocalValue2); } @@ -78,11 +86,11 @@ TEST_F(ScopedHashTableTest, OverrideInScope) { TEST_F(ScopedHashTableTest, GetCurScope) { EXPECT_EQ(symbolTable.getCurScope(), &globalScope); { - ScopedHashTableScope funScope(symbolTable); - ScopedHashTableScope funScope2(symbolTable); + ScopedHashTableScope funScope(symbolTable); + ScopedHashTableScope funScope2(symbolTable); EXPECT_EQ(symbolTable.getCurScope(), &funScope2); { - ScopedHashTableScope blockScope(symbolTable); + ScopedHashTableScope blockScope(symbolTable); EXPECT_EQ(symbolTable.getCurScope(), &blockScope); } EXPECT_EQ(symbolTable.getCurScope(), &funScope2); From 104ad9258a0f93a969bf7a85ebc0c7d9c533edf1 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 18 Dec 2024 20:09:33 -0800 Subject: [PATCH 007/209] [SelectionDAG] Rename SDNode::uses() to users(). (#120499) This function is most often used in range based loops or algorithms where the iterator is implicitly dereferenced. The dereference returns an SDNode * of the user rather than SDUse * so users() is a better name. I've long beeen annoyed that we can't write a range based loop over SDUse when we need getOperandNo. I plan to rename use_iterator to user_iterator and add a use_iterator that returns SDUse& on dereference. This will make it more like IR. 
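A typical call site before and after the rename (a sketch for illustration, not lifted verbatim from the diff below):

```c++
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Collect every node that uses a value produced by N.
static void addUsers(SDNode *N, SmallVectorImpl<SDNode *> &Worklist) {
  for (SDNode *User : N->users()) // previously spelled N->uses()
    Worklist.push_back(User);
}
```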
--- llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 10 ++- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 70 +++++++++---------- .../lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 6 +- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 4 +- .../CodeGen/SelectionDAG/LegalizeTypes.cpp | 4 +- .../CodeGen/SelectionDAG/ScheduleDAGFast.cpp | 2 +- .../SelectionDAG/ScheduleDAGSDNodes.cpp | 2 +- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 14 ++-- .../CodeGen/SelectionDAG/SelectionDAGISel.cpp | 2 +- .../Target/AArch64/AArch64ISelDAGToDAG.cpp | 20 +++--- .../Target/AArch64/AArch64ISelLowering.cpp | 16 ++--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 10 +-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 12 ++-- llvm/lib/Target/ARM/ARMISelLowering.cpp | 14 ++-- .../Target/Hexagon/HexagonISelDAGToDAGHVX.cpp | 6 +- .../LoongArch/LoongArchISelLowering.cpp | 2 +- llvm/lib/Target/M68k/M68kISelLowering.cpp | 2 +- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 2 +- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 8 +-- llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 10 +-- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 58 +++++++-------- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 18 ++--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 6 +- .../Target/SystemZ/SystemZISelDAGToDAG.cpp | 2 +- .../Target/SystemZ/SystemZISelLowering.cpp | 12 ++-- llvm/lib/Target/VE/VEISelLowering.cpp | 4 +- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 4 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 53 +++++++------- llvm/lib/Target/X86/X86ISelLoweringCall.cpp | 2 +- 29 files changed, 190 insertions(+), 185 deletions(-) diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 61f3c6329efce..b525872f9dd2a 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -750,7 +750,7 @@ END_TWO_BYTE_PACK() bool use_empty() const { return UseList == nullptr; } /// Return true if there is exactly one use of this node. - bool hasOneUse() const { return hasSingleElement(uses()); } + bool hasOneUse() const { return hasSingleElement(users()); } /// Return the number of uses of this node. This method takes /// time proportional to the number of uses. @@ -844,10 +844,14 @@ END_TWO_BYTE_PACK() static use_iterator use_end() { return use_iterator(nullptr); } - inline iterator_range uses() { + // Dereferencing use_iterator returns the user SDNode* making it closer to a + // user_iterator thus this function is called users() to reflect that. + // FIXME: Rename to user_iterator and introduce a use_iterator that returns + // SDUse*. + inline iterator_range users() { return make_range(use_begin(), use_end()); } - inline iterator_range uses() const { + inline iterator_range users() const { return make_range(use_begin(), use_end()); } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 10fc8eecaff90..ebce0ebe8f81c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -202,7 +202,7 @@ namespace { /// When an instruction is simplified, add all users of the instruction to /// the work lists because they might get more simplified now. 
void AddUsersToWorklist(SDNode *N) { - for (SDNode *Node : N->uses()) + for (SDNode *Node : N->users()) AddToWorklist(Node); } @@ -1113,7 +1113,7 @@ bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc, : N1.getConstantOperandVal(1))); if (Opc == ISD::SUB) ScalableOffset = -ScalableOffset; - if (all_of(N->uses(), [&](SDNode *Node) { + if (all_of(N->users(), [&](SDNode *Node) { if (auto *LoadStore = dyn_cast(Node); LoadStore && LoadStore->getBasePtr().getNode() == N) { TargetLoweringBase::AddrMode AM; @@ -1151,7 +1151,7 @@ bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc, return false; const int64_t CombinedValue = CombinedValueIntVal.getSExtValue(); - for (SDNode *Node : N->uses()) { + for (SDNode *Node : N->users()) { if (auto *LoadStore = dyn_cast(Node)) { // Is x[offset2] already not a legal addressing mode? If so then // reassociating the constants breaks nothing (we test offset2 because @@ -1176,7 +1176,7 @@ bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc, if (GA->getOpcode() == ISD::GlobalAddress && TLI.isOffsetFoldingLegal(GA)) return false; - for (SDNode *Node : N->uses()) { + for (SDNode *Node : N->users()) { auto *LoadStore = dyn_cast(Node); if (!LoadStore) return false; @@ -4720,7 +4720,7 @@ SDValue DAGCombiner::useDivRem(SDNode *Node) { SDValue Op0 = Node->getOperand(0); SDValue Op1 = Node->getOperand(1); SDValue combined; - for (SDNode *User : Op0->uses()) { + for (SDNode *User : Op0->users()) { if (User == Node || User->getOpcode() == ISD::DELETED_NODE || User->use_empty()) continue; @@ -10369,7 +10369,7 @@ static SDValue combineShiftToMULH(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, unsigned MulLoHiOp = IsSignExt ? ISD::SMUL_LOHI : ISD::UMUL_LOHI; if (!ShiftOperand.hasOneUse() && TLI.isOperationLegalOrCustom(MulLoHiOp, NarrowVT) && - llvm::any_of(ShiftOperand->uses(), UserOfLowerBits)) { + llvm::any_of(ShiftOperand->users(), UserOfLowerBits)) { return SDValue(); } @@ -13570,7 +13570,7 @@ static SDValue tryToFoldExtOfLoad(SelectionDAG &DAG, DAGCombiner &Combiner, if (NonNegZExt) { assert(ExtLoadType == ISD::ZEXTLOAD && ExtOpc == ISD::ZERO_EXTEND && "Unexpected load type or opcode"); - for (SDNode *User : N0->uses()) { + for (SDNode *User : N0->users()) { if (User->getOpcode() == ISD::SETCC) { ISD::CondCode CC = cast(User->getOperand(2))->get(); if (ISD::isSignedIntSetCC(CC)) { @@ -17673,7 +17673,7 @@ SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) { // Find all FDIV users of the same divisor. // Use a set because duplicates may be present in the user list. SetVector Users; - for (auto *U : N1->uses()) { + for (auto *U : N1->users()) { if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) { // Skip X/sqrt(X) that has not been simplified to sqrt(X) yet. if (U->getOperand(1).getOpcode() == ISD::FSQRT && @@ -18965,15 +18965,15 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { // Now check for #3 and #4. bool RealUse = false; - for (SDNode *Use : Ptr->uses()) { - if (Use == N) + for (SDNode *User : Ptr->users()) { + if (User == N) continue; - if (SDNode::hasPredecessorHelper(Use, Visited, Worklist, MaxSteps)) + if (SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps)) return false; // If Ptr may be folded in addressing mode of other use, then it's // not profitable to do this transformation. 
- if (!canFoldInAddressingMode(Ptr.getNode(), Use, DAG, TLI)) + if (!canFoldInAddressingMode(Ptr.getNode(), User, DAG, TLI)) RealUse = true; } @@ -19089,19 +19089,19 @@ static bool shouldCombineToPostInc(SDNode *N, SDValue Ptr, SDNode *PtrUse, SmallPtrSet Visited; unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps(); - for (SDNode *Use : BasePtr->uses()) { - if (Use == Ptr.getNode()) + for (SDNode *User : BasePtr->users()) { + if (User == Ptr.getNode()) continue; // No if there's a later user which could perform the index instead. - if (isa(Use)) { + if (isa(User)) { bool IsLoad = true; bool IsMasked = false; SDValue OtherPtr; - if (getCombineLoadStoreParts(Use, ISD::POST_INC, ISD::POST_DEC, IsLoad, + if (getCombineLoadStoreParts(User, ISD::POST_INC, ISD::POST_DEC, IsLoad, IsMasked, OtherPtr, TLI)) { SmallVector Worklist; - Worklist.push_back(Use); + Worklist.push_back(User); if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps)) return false; } @@ -19109,9 +19109,9 @@ static bool shouldCombineToPostInc(SDNode *N, SDValue Ptr, SDNode *PtrUse, // If all the uses are load / store addresses, then don't do the // transformation. - if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) { - for (SDNode *UseUse : Use->uses()) - if (canFoldInAddressingMode(Use, UseUse, DAG, TLI)) + if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SUB) { + for (SDNode *UserUser : User->users()) + if (canFoldInAddressingMode(User, UserUser, DAG, TLI)) return false; } } @@ -19136,7 +19136,7 @@ static SDNode *getPostIndexedLoadStoreOp(SDNode *N, bool &IsLoad, // nor a successor of N. Otherwise, if Op is folded that would // create a cycle. unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps(); - for (SDNode *Op : Ptr->uses()) { + for (SDNode *Op : Ptr->users()) { // Check for #1. if (!shouldCombineToPostInc(N, Ptr, Op, BasePtr, Offset, AM, DAG, TLI)) continue; @@ -20515,24 +20515,24 @@ bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode, SDValue AddNode, return true; // Walk all the users of the constant with which we're multiplying. - for (SDNode *Use : ConstNode->uses()) { - if (Use == MulNode) // This use is the one we're on right now. Skip it. + for (SDNode *User : ConstNode->users()) { + if (User == MulNode) // This use is the one we're on right now. Skip it. continue; - if (Use->getOpcode() == ISD::MUL) { // We have another multiply use. + if (User->getOpcode() == ISD::MUL) { // We have another multiply use. SDNode *OtherOp; SDNode *MulVar = AddNode.getOperand(0).getNode(); // OtherOp is what we're multiplying against the constant. - if (Use->getOperand(0) == ConstNode) - OtherOp = Use->getOperand(1).getNode(); + if (User->getOperand(0) == ConstNode) + OtherOp = User->getOperand(1).getNode(); else - OtherOp = Use->getOperand(0).getNode(); + OtherOp = User->getOperand(0).getNode(); // Check to see if multiply is with the same operand of our "add". // // ConstNode = CONST - // Use = ConstNode * A <-- visiting Use. OtherOp is A. + // User = ConstNode * A <-- visiting User. OtherOp is A. // ... // AddNode = (A + c1) <-- MulVar is A. // = AddNode * ConstNode <-- current visiting instruction. @@ -20550,7 +20550,7 @@ bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode, SDValue AddNode, // ... = AddNode * ConstNode <-- current visiting instruction. // ... // OtherOp = (A + c2) - // Use = OtherOp * ConstNode <-- visiting Use. + // User = OtherOp * ConstNode <-- visiting User. 
// // If we make this transformation, we will have a common // multiply (CONST * A) after we also do the same transformation @@ -22902,7 +22902,7 @@ bool DAGCombiner::refineExtractVectorEltIntoMultipleNarrowExtractVectorElts( // Did we fail to model any of the users of the Producer? bool ProducerIsLeaf = false; // Look at each user of this Producer. - for (SDNode *User : E.Producer->uses()) { + for (SDNode *User : E.Producer->users()) { switch (User->getOpcode()) { // TODO: support ISD::BITCAST // TODO: support ISD::ANY_EXTEND @@ -23176,14 +23176,14 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { // If only EXTRACT_VECTOR_ELT nodes use the source vector we can // simplify it based on the (valid) extraction indices. - if (llvm::all_of(VecOp->uses(), [&](SDNode *Use) { + if (llvm::all_of(VecOp->users(), [&](SDNode *Use) { return Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT && Use->getOperand(0) == VecOp && isa(Use->getOperand(1)); })) { APInt DemandedElts = APInt::getZero(NumElts); - for (SDNode *Use : VecOp->uses()) { - auto *CstElt = cast(Use->getOperand(1)); + for (SDNode *User : VecOp->users()) { + auto *CstElt = cast(User->getOperand(1)); if (CstElt->getAPIntValue().ult(NumElts)) DemandedElts.setBit(CstElt->getZExtValue()); } @@ -27302,7 +27302,7 @@ SDValue DAGCombiner::visitGET_FPENV_MEM(SDNode *N) { // Check if the memory, where FP state is written to, is used only in a single // load operation. LoadSDNode *LdNode = nullptr; - for (auto *U : Ptr->uses()) { + for (auto *U : Ptr->users()) { if (U == N) continue; if (auto *Ld = dyn_cast(U)) { @@ -27352,7 +27352,7 @@ SDValue DAGCombiner::visitSET_FPENV_MEM(SDNode *N) { // Check if the address of FP state is used also in a store operation only. StoreSDNode *StNode = nullptr; - for (auto *U : Ptr->uses()) { + for (auto *U : Ptr->users()) { if (U == N) continue; if (auto *St = dyn_cast(U)) { diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 9c7085cc7e7a8..8e313fb21eede 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -105,7 +105,7 @@ void InstrEmitter::EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, if (TLI->isTypeLegal(VT)) UseRC = TLI->getRegClassFor(VT, Node->isDivergent()); - for (SDNode *User : Node->uses()) { + for (SDNode *User : Node->users()) { bool Match = true; if (User->getOpcode() == ISD::CopyToReg && User->getOperand(2).getNode() == Node && @@ -225,7 +225,7 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node, } if (!VRBase && !IsClone && !IsCloned) - for (SDNode *User : Node->uses()) { + for (SDNode *User : Node->users()) { if (User->getOpcode() == ISD::CopyToReg && User->getOperand(2).getNode() == Node && User->getOperand(2).getResNo() == i) { @@ -502,7 +502,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node, VRBaseMapType &VRBaseMap, // If the node is only used by a CopyToReg and the dest reg is a vreg, use // the CopyToReg'd destination register instead of creating a new vreg. 
- for (SDNode *User : Node->uses()) { + for (SDNode *User : Node->users()) { if (User->getOpcode() == ISD::CopyToReg && User->getOperand(2).getNode() == Node) { Register DestReg = cast(User->getOperand(1))->getReg(); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index ca87168929f96..595a410101eca 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1394,7 +1394,7 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) { Visited.insert(Op.getNode()); Worklist.push_back(Idx.getNode()); SDValue StackPtr, Ch; - for (SDNode *User : Vec.getNode()->uses()) { + for (SDNode *User : Vec.getNode()->users()) { if (StoreSDNode *ST = dyn_cast(User)) { if (ST->isIndexed() || ST->isTruncatingStore() || ST->getValue() != Vec) @@ -2293,7 +2293,7 @@ static bool useSinCos(SDNode *Node) { ? ISD::FCOS : ISD::FSIN; SDValue Op0 = Node->getOperand(0); - for (const SDNode *User : Op0.getNode()->uses()) { + for (const SDNode *User : Op0.getNode()->users()) { if (User == Node) continue; // The other user might have been turned into sincos already. diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp index cb6d3fe4db8a4..c7d29ec1a836c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp @@ -189,7 +189,7 @@ void DAGTypeLegalizer::PerformExpensiveChecks() { #ifndef NDEBUG // Checked that NewNodes are only used by other NewNodes. for (SDNode *N : NewNodes) { - for (SDNode *U : N->uses()) + for (SDNode *U : N->users()) assert(U->getNodeId() == NewNode && "NewNode used by non-NewNode!"); } #endif @@ -399,7 +399,7 @@ bool DAGTypeLegalizer::run() { assert(N->getNodeId() == ReadyToProcess && "Node ID recalculated?"); N->setNodeId(Processed); - for (SDNode *User : N->uses()) { + for (SDNode *User : N->users()) { int NodeId = User->getNodeId(); // This node has two options: it can either be a new node or its Node ID diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp index 70a7438440191..26eba4b257fb9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp @@ -756,7 +756,7 @@ void ScheduleDAGLinearize::Schedule() { // Glue user must be scheduled together with the glue operand. So other // users of the glue operand must be treated as its users. SDNode *ImmGUser = Glue->getGluedUser(); - for (const SDNode *U : Glue->uses()) + for (const SDNode *U : Glue->users()) if (U == ImmGUser) --Degree; GUser->setNodeId(UDegree + Degree); diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 31939ae5922ec..2e59dbf2f7028 100644 --- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -388,7 +388,7 @@ void ScheduleDAGSDNodes::BuildSchedUnits() { // There are either zero or one users of the Glue result. 
bool HasGlueUse = false; - for (SDNode *U : N->uses()) + for (SDNode *U : N->users()) if (GlueVal.isOperandOf(U)) { HasGlueUse = true; assert(N->getNodeId() == -1 && "Node already inserted!"); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 0fb5c4d5c4cb9..bd9e5d4dce8ec 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2556,7 +2556,7 @@ bool SelectionDAG::expandMultipleResultFPLibCall( // destination pointers can be used instead of creating stack allocations. SDValue StoresInChain; SmallVector ResultStores(NumResults); - for (SDNode *User : Node->uses()) { + for (SDNode *User : Node->users()) { if (!ISD::isNormalStore(User)) continue; auto *ST = cast(User); @@ -7933,7 +7933,7 @@ SDValue SelectionDAG::getStackArgumentTokenFactor(SDValue Chain) { ArgChains.push_back(Chain); // Add a chain value for each stack argument. - for (SDNode *U : getEntryNode().getNode()->uses()) + for (SDNode *U : getEntryNode().getNode()->users()) if (LoadSDNode *L = dyn_cast(U)) if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) if (FI->getIndex() < 0) @@ -11926,7 +11926,7 @@ void SelectionDAG::updateDivergence(SDNode *N) { bool IsDivergent = calculateDivergence(N); if (N->SDNodeBits.IsDivergent != IsDivergent) { N->SDNodeBits.IsDivergent = IsDivergent; - llvm::append_range(Worklist, N->uses()); + llvm::append_range(Worklist, N->users()); } } while (!Worklist.empty()); } @@ -11942,7 +11942,7 @@ void SelectionDAG::CreateTopologicalOrder(std::vector &Order) { } for (size_t I = 0; I != Order.size(); ++I) { SDNode *N = Order[I]; - for (auto *U : N->uses()) { + for (auto *U : N->users()) { unsigned &UnsortedOps = Degree[U]; if (0 == --UnsortedOps) Order.push_back(U); @@ -12071,7 +12071,7 @@ unsigned SelectionDAG::AssignTopologicalOrder() { checkForCycles(N, this); // N is in sorted position, so all its uses have one less operand // that needs to be sorted. - for (SDNode *P : N->uses()) { + for (SDNode *P : N->users()) { unsigned Degree = P->getNodeId(); assert(Degree != 0 && "Invalid node degree"); --Degree; @@ -12489,7 +12489,7 @@ bool SDNode::hasAnyUseOfValue(unsigned Value) const { /// isOnlyUserOf - Return true if this node is the only use of N. bool SDNode::isOnlyUserOf(const SDNode *N) const { bool Seen = false; - for (const SDNode *User : N->uses()) { + for (const SDNode *User : N->users()) { if (User == this) Seen = true; else @@ -12502,7 +12502,7 @@ bool SDNode::isOnlyUserOf(const SDNode *N) const { /// Return true if the only users of N are contained in Nodes. 
bool SDNode::areOnlyUsersOf(ArrayRef Nodes, const SDNode *N) { bool Seen = false; - for (const SDNode *User : N->uses()) { + for (const SDNode *User : N->users()) { if (llvm::is_contained(Nodes, User)) Seen = true; else diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 35aa7b87bc3b7..9147fb1c2badf 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -1225,7 +1225,7 @@ void SelectionDAGISel::EnforceNodeIdInvariant(SDNode *Node) { while (!Nodes.empty()) { SDNode *N = Nodes.pop_back_val(); - for (auto *U : N->uses()) { + for (auto *U : N->users()) { auto UId = U->getNodeId(); if (UId > 0) { InvalidateNodeId(U); diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 5df61b3722037..ff3ca8a24fc04 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -679,9 +679,9 @@ static bool isWorthFoldingSHL(SDValue V) { // operation. If yes, do not try to fold this node into the address // computation, since the computation will be kept. const SDNode *Node = V.getNode(); - for (SDNode *UI : Node->uses()) + for (SDNode *UI : Node->users()) if (!isa(*UI)) - for (SDNode *UII : UI->uses()) + for (SDNode *UII : UI->users()) if (!isa(*UII)) return false; return true; @@ -1012,15 +1012,15 @@ bool AArch64DAGToDAGISel::SelectArithUXTXRegister(SDValue N, SDValue &Reg, /// a single pseudo-instruction for an ADRP/ADD pair so over-aggressive folding /// leads to duplicated ADRP instructions. static bool isWorthFoldingADDlow(SDValue N) { - for (auto *Use : N->uses()) { - if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE && - Use->getOpcode() != ISD::ATOMIC_LOAD && - Use->getOpcode() != ISD::ATOMIC_STORE) + for (auto *User : N->users()) { + if (User->getOpcode() != ISD::LOAD && User->getOpcode() != ISD::STORE && + User->getOpcode() != ISD::ATOMIC_LOAD && + User->getOpcode() != ISD::ATOMIC_STORE) return false; // ldar and stlr have much more restrictive addressing modes (just a // register). - if (isStrongerThanMonotonic(cast(Use)->getSuccessOrdering())) + if (isStrongerThanMonotonic(cast(User)->getSuccessOrdering())) return false; } @@ -1245,7 +1245,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size, // operation. If yes, do not try to fold this node into the address // computation, since the computation will be kept. const SDNode *Node = N.getNode(); - for (SDNode *UI : Node->uses()) { + for (SDNode *UI : Node->users()) { if (!isa(*UI)) return false; } @@ -1329,7 +1329,7 @@ bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size, // operation. If yes, do not try to fold this node into the address // computation, since the computation will be kept. 
const SDNode *Node = N.getNode(); - for (SDNode *UI : Node->uses()) { + for (SDNode *UI : Node->users()) { if (!isa(*UI)) return false; } @@ -3031,7 +3031,7 @@ static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) { } APInt UsersUsefulBits(UsefulBits.getBitWidth(), 0); - for (SDNode *Node : Op.getNode()->uses()) { + for (SDNode *Node : Op.getNode()->users()) { // A use cannot produce useful bits APInt UsefulBitsForUse = APInt(UsefulBits); getUsefulBitsForUse(Node, UsefulBitsForUse, Op, Depth); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index cb6ba06bd4425..5865dbe1307ba 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -6464,7 +6464,7 @@ bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { return false; unsigned NumExtMaskedLoads = 0; - for (auto *U : Ld->getMask()->uses()) + for (auto *U : Ld->getMask()->users()) if (isa(U)) NumExtMaskedLoads++; @@ -8559,7 +8559,7 @@ SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, ArgChains.push_back(Chain); // Add a chain value for each stack argument corresponding - for (SDNode *U : DAG.getEntryNode().getNode()->uses()) + for (SDNode *U : DAG.getEntryNode().getNode()->users()) if (LoadSDNode *L = dyn_cast(U)) if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) if (FI->getIndex() < 0) { @@ -19586,7 +19586,7 @@ static SDValue performANDSETCCCombine(SDNode *N, // Checks if the current node (N) is used by any SELECT instruction and // returns an empty SDValue to avoid applying the optimization to prevent // incorrect results - for (auto U : N->uses()) + for (auto U : N->users()) if (U->getOpcode() == ISD::SELECT) return SDValue(); @@ -24761,7 +24761,7 @@ static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) { EVT UseMVT = FirstUse->getValueType(0); if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits()) return SDValue(); - if (any_of(Op->uses(), [&UseMVT](const SDNode *N) { + if (any_of(Op->users(), [&UseMVT](const SDNode *N) { return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT; })) return SDValue(); @@ -25335,7 +25335,7 @@ static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); uint64_t MinOffset = -1ull; - for (SDNode *N : GN->uses()) { + for (SDNode *N : GN->users()) { if (N->getOpcode() != ISD::ADD) return SDValue(); auto *C = dyn_cast(N->getOperand(0)); @@ -26054,7 +26054,7 @@ static SDValue tryCombineMULLWithUZP1(SDNode *N, HasFoundMULLow = false; // Find ExtractLow. 
- for (SDNode *User : ExtractHighSrcVec.getNode()->uses()) { + for (SDNode *User : ExtractHighSrcVec.getNode()->users()) { if (User == ExtractHigh.getNode()) continue; @@ -26561,7 +26561,7 @@ bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, return false; bool HasRet = false; - for (SDNode *Node : Copy->uses()) { + for (SDNode *Node : Copy->users()) { if (Node->getOpcode() != AArch64ISD::RET_GLUE) return false; HasRet = true; @@ -29650,7 +29650,7 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR( bool AArch64TargetLowering::preferScalarizeSplat(SDNode *N) const { unsigned Opc = N->getOpcode(); if (ISD::isExtOpcode(Opc)) { - if (any_of(N->uses(), + if (any_of(N->users(), [&](SDNode *Use) { return Use->getOpcode() == ISD::MUL; })) return false; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 48e9af9fe507f..c129759f3d3c7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -772,7 +772,7 @@ bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N, assert(!N->use_empty()); // XXX - Should this limit number of uses to check? - for (const SDNode *U : N->uses()) { + for (const SDNode *U : N->users()) { if (!hasSourceMods(U)) return false; @@ -1348,7 +1348,7 @@ SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain, ArgChains.push_back(Chain); // Add a chain value for each stack argument corresponding - for (SDNode *U : DAG.getEntryNode().getNode()->uses()) { + for (SDNode *U : DAG.getEntryNode().getNode()->users()) { if (LoadSDNode *L = dyn_cast(U)) { if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) { if (FI->getIndex() < 0) { @@ -3814,7 +3814,7 @@ static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, } static bool hasVolatileUser(SDNode *Val) { - for (SDNode *U : Val->uses()) { + for (SDNode *U : Val->users()) { if (MemSDNode *M = dyn_cast(U)) { if (M->isVolatile()) return true; @@ -4338,7 +4338,7 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N, if (!AddOp) return SDValue(); - if (V.hasOneUse() || all_of(V->uses(), [](const SDNode *U) -> bool { + if (V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool { return U->getOpcode() == ISD::MUL; })) return AddOp; @@ -4927,7 +4927,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N, SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res); DAG.ReplaceAllUsesWith(N0, Neg); - for (SDNode *U : Neg->uses()) + for (SDNode *U : Neg->users()) DCI.AddToWorklist(U); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 7da93f90341d2..2b8cc5b4e33a4 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -12544,21 +12544,21 @@ SDValue SITargetLowering::performOrCombine(SDNode *N, return true; // If we have any non-vectorized use, then it is a candidate for v_perm - for (auto *VUse : OrUse->uses()) { - if (!VUse->getValueType(0).isVector()) + for (auto *VUser : OrUse->users()) { + if (!VUser->getValueType(0).isVector()) return true; // If the use of a vector is a store, then combining via a v_perm // is beneficial. 
// TODO -- whitelist more uses for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg}) - if (VUse->getOpcode() == VectorwiseOp) + if (VUser->getOpcode() == VectorwiseOp) return true; } return false; }; - if (!any_of(N->uses(), usesCombinedOperand)) + if (!any_of(N->users(), usesCombinedOperand)) return SDValue(); uint32_t LHSMask = getPermuteMask(LHS); @@ -13895,10 +13895,10 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, // part of full-rate 64-bit ops). if (!Subtarget->hasFullRate64Ops()) { unsigned NumUsers = 0; - for (SDNode *Use : LHS->uses()) { + for (SDNode *User : LHS->users()) { // There is a use that does not feed into addition, so the multiply can't // be removed. We prefer MUL + ADD + ADDC over MAD + MUL. - if (Use->getOpcode() != ISD::ADD) + if (User->getOpcode() != ISD::ADD) return SDValue(); // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 2b20154042fe2..764d3c879f2d6 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -3467,7 +3467,7 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { SDNode *VMov = Copy; // f64 returned in a pair of GPRs. SmallPtrSet Copies; - for (SDNode *U : VMov->uses()) { + for (SDNode *U : VMov->users()) { if (U->getOpcode() != ISD::CopyToReg) return false; Copies.insert(U); @@ -3475,7 +3475,7 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { if (Copies.size() > 2) return false; - for (SDNode *U : VMov->uses()) { + for (SDNode *U : VMov->users()) { SDValue UseChain = U->getOperand(0); if (Copies.count(UseChain.getNode())) // Second CopyToReg @@ -3507,7 +3507,7 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { } bool HasRet = false; - for (const SDNode *U : Copy->uses()) { + for (const SDNode *U : Copy->users()) { if (U->getOpcode() != ARMISD::RET_GLUE && U->getOpcode() != ARMISD::INTRET_GLUE) return false; @@ -7958,7 +7958,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // generate a vdup of the constant. if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize && (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) && - all_of(BVN->uses(), + all_of(BVN->users(), [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) { EVT DupVT = SplatBitSize == 32 ? MVT::v4i32 : SplatBitSize == 16 ? MVT::v8i16 @@ -13970,7 +13970,7 @@ static SDValue PerformSHLSimplify(SDNode *N, return SDValue(); // Check that all the users could perform the shl themselves. 
- for (auto *U : N->uses()) { + for (auto *U : N->users()) { switch(U->getOpcode()) { default: return SDValue(); @@ -15574,13 +15574,13 @@ PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return SDValue(); // Find another extract, of Lane + 1 - auto OtherIt = find_if(Op0->uses(), [&](SDNode *V) { + auto OtherIt = find_if(Op0->users(), [&](SDNode *V) { return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT && isa(V->getOperand(1)) && V->getConstantOperandVal(1) == Lane + 1 && V->getOperand(0).getResNo() == ResNo; }); - if (OtherIt == Op0->uses().end()) + if (OtherIt == Op0->users().end()) return SDValue(); // For float extracts, we need to be converting to a i32 for both vector diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp index c1937ff70f366..db9aa7e18f5e7 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp @@ -1756,7 +1756,7 @@ void HvxSelector::select(SDNode *ISelN) { // Don't want to select N0 if it's shared with another node, except if // it's shared with other ISELs. auto IsISelN = [](SDNode *T) { return T->getOpcode() == HexagonISD::ISEL; }; - if (llvm::all_of(N0->uses(), IsISelN)) + if (llvm::all_of(N0->users(), IsISelN)) SubNodes.insert(N0); } if (SubNodes.empty()) { @@ -1775,7 +1775,7 @@ void HvxSelector::select(SDNode *ISelN) { return true; if (T->use_empty() || NonDom.count(T)) return false; - for (SDNode *U : T->uses()) { + for (SDNode *U : T->users()) { // If T is reachable from a known non-dominated node, then T itself // is non-dominated. if (!Rec(U, Rec)) { @@ -1814,7 +1814,7 @@ void HvxSelector::select(SDNode *ISelN) { for (unsigned I = 0; I != TmpQ.size(); ++I) { SDNode *S = TmpQ[I]; - for (SDNode *U : S->uses()) { + for (SDNode *U : S->users()) { if (U == ISelN) continue; auto F = OpCount.find(U); diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 104e601de044b..e32ed41c2893c 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -5351,7 +5351,7 @@ bool LoongArchTargetLowering::isUsedByReturnOnly(SDNode *N, // The copy must be used by a LoongArchISD::RET, and nothing else. bool HasRet = false; - for (SDNode *Node : Copy->uses()) { + for (SDNode *Node : Copy->users()) { if (Node->getOpcode() != LoongArchISD::RET) return false; HasRet = true; diff --git a/llvm/lib/Target/M68k/M68kISelLowering.cpp b/llvm/lib/Target/M68k/M68kISelLowering.cpp index ff966baecf27d..98ed46d91da60 100644 --- a/llvm/lib/Target/M68k/M68kISelLowering.cpp +++ b/llvm/lib/Target/M68k/M68kISelLowering.cpp @@ -1990,7 +1990,7 @@ SDValue M68kTargetLowering::EmitTest(SDValue Op, unsigned M68kCC, case ISD::XOR: // Due to the ISEL shortcoming noted above, be conservative if this op is // likely to be selected as part of a load-modify-store instruction. - for (const auto *U : Op.getNode()->uses()) + for (const auto *U : Op.getNode()->users()) if (U->getOpcode() == ISD::STORE) goto default_case; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 94e90a84a2d41..c838b21cbf75e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -318,7 +318,7 @@ bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) { return false; // Find and record all uses of this vector that extract element 0 or 1. 
SmallVector E0, E1; - for (auto *U : Vector.getNode()->uses()) { + for (auto *U : Vector.getNode()->users()) { if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT) continue; if (U->getOperand(0) != Vector) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index a033a8247fac5..5c1f717694a4c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -4495,7 +4495,7 @@ PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, // int numUses = 0; int nonAddCount = 0; - for (const SDNode *User : N0.getNode()->uses()) { + for (const SDNode *User : N0.getNode()->users()) { numUses++; if (User->getOpcode() != ISD::FADD) ++nonAddCount; @@ -4523,7 +4523,7 @@ PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, opIsLive = true; if (!opIsLive) - for (const SDNode *User : left->uses()) { + for (const SDNode *User : left->users()) { int orderNo3 = User->getIROrder(); if (orderNo3 > orderNo) { opIsLive = true; @@ -4532,7 +4532,7 @@ PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, } if (!opIsLive) - for (const SDNode *User : right->uses()) { + for (const SDNode *User : right->users()) { int orderNo3 = User->getIROrder(); if (orderNo3 > orderNo) { opIsLive = true; @@ -4730,7 +4730,7 @@ static SDValue PerformREMCombine(SDNode *N, const SDValue &Num = N->getOperand(0); const SDValue &Den = N->getOperand(1); - for (const SDNode *U : Num->uses()) { + for (const SDNode *U : Num->users()) { if (U->getOpcode() == DivOpc && U->getOperand(0) == Num && U->getOperand(1) == Den) { // Num % Den -> Num - (Num / Den) * Den diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 2475b8ad11f10..277c1414d7160 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -750,7 +750,7 @@ static bool canOptimizeTLSDFormToXForm(SelectionDAG *CurDAG, SDValue Base) { // Base is expected to be an ADD_TLS node. if (Base.getOpcode() != PPCISD::ADD_TLS) return false; - for (auto *ADDTLSUse : Base.getNode()->uses()) { + for (auto *ADDTLSUse : Base.getNode()->users()) { // The optimization to convert the D-Form load/store into its X-Form // counterpart should only occur if the source value offset of the load/ // store is 0. This also means that The offset should always be undefined. @@ -3986,7 +3986,7 @@ static bool allUsesExtend(SDValue Compare, SelectionDAG *CurDAG) { return true; // We want the value in a GPR if it is being extended, used for a select, or // used in logical operations. - for (auto *CompareUse : Compare.getNode()->uses()) + for (auto *CompareUse : Compare.getNode()->users()) if (CompareUse->getOpcode() != ISD::SIGN_EXTEND && CompareUse->getOpcode() != ISD::ZERO_EXTEND && CompareUse->getOpcode() != ISD::SELECT && @@ -6701,7 +6701,7 @@ void PPCDAGToDAGISel::PostprocessISelDAG() { // be folded with the isel so that we don't need to materialize a register // containing zero. 
bool PPCDAGToDAGISel::AllUsersSelectZero(SDNode *N) { - for (const SDNode *User : N->uses()) { + for (const SDNode *User : N->users()) { if (!User->isMachineOpcode()) return false; if (User->getMachineOpcode() != PPC::SELECT_I4 && @@ -6731,7 +6731,7 @@ bool PPCDAGToDAGISel::AllUsersSelectZero(SDNode *N) { void PPCDAGToDAGISel::SwapAllSelectUsers(SDNode *N) { SmallVector ToReplace; - for (SDNode *User : N->uses()) { + for (SDNode *User : N->users()) { assert((User->getMachineOpcode() == PPC::SELECT_I4 || User->getMachineOpcode() == PPC::SELECT_I8) && "Must have all select users"); @@ -7382,7 +7382,7 @@ void PPCDAGToDAGISel::PeepholePPC64ZExt() { // (except for the original INSERT_SUBREG), then abort the transformation. bool OutsideUse = false; for (SDNode *PN : ToPromote) { - for (SDNode *UN : PN->uses()) { + for (SDNode *UN : PN->users()) { if (!ToPromote.count(UN) && UN != ISR.getNode()) { OutsideUse = true; break; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 69bc2cce6c2c7..199e1f41cfc05 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -2687,7 +2687,7 @@ static bool provablyDisjointOr(SelectionDAG &DAG, const SDValue &N) { bool PPCTargetLowering::SelectAddressEVXRegReg(SDValue N, SDValue &Base, SDValue &Index, SelectionDAG &DAG) const { - for (SDNode *U : N->uses()) { + for (SDNode *U : N->users()) { if (MemSDNode *Memop = dyn_cast(U)) { if (Memop->getMemoryVT() == MVT::f64) { Base = N.getOperand(0); @@ -12033,7 +12033,7 @@ SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { SDValue PPCTargetLowering::LowerUaddo(SDValue Op, SelectionDAG &DAG) const { // Default to target independent lowering if there is a logical user of the // carry-bit. 
- for (SDNode *U : Op->uses()) { + for (SDNode *U : Op->users()) { if (U->getOpcode() == ISD::SELECT) return SDValue(); if (ISD::isBitwiseLogicOp(U->getOpcode())) { @@ -14290,7 +14290,7 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG)) return true; - for (SDNode *U : LoadRoot->uses()) + for (SDNode *U : LoadRoot->users()) if (((isa(U) && cast(U)->getChain().getNode() == LoadRoot) || U->getOpcode() == ISD::TokenFactor) && @@ -14352,7 +14352,7 @@ SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N, // If all users of SETCC extend its value to a legal integer type // then we replace SETCC with a subtraction - for (const SDNode *U : N->uses()) + for (const SDNode *U : N->users()) if (U->getOpcode() != ISD::ZERO_EXTEND) return SDValue(); @@ -14531,7 +14531,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, if (isa(Inputs[i])) continue; - for (const SDNode *User : Inputs[i].getNode()->uses()) { + for (const SDNode *User : Inputs[i].getNode()->users()) { if (User != N && !Visited.count(User)) return SDValue(); @@ -14552,7 +14552,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N, } for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) { - for (const SDNode *User : PromOps[i].getNode()->uses()) { + for (const SDNode *User : PromOps[i].getNode()->users()) { if (User != N && !Visited.count(User)) return SDValue(); @@ -14736,7 +14736,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, if (isa(Inputs[i])) continue; - for (SDNode *User : Inputs[i].getNode()->uses()) { + for (SDNode *User : Inputs[i].getNode()->users()) { if (User != N && !Visited.count(User)) return SDValue(); @@ -14758,7 +14758,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N, } for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) { - for (SDNode *User : PromOps[i].getNode()->uses()) { + for (SDNode *User : PromOps[i].getNode()->users()) { if (User != N && !Visited.count(User)) return SDValue(); @@ -16556,35 +16556,35 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, APInt::getAllOnes(Bits /* alignment */) .zext(Add.getScalarValueSizeInBits()))) { SDNode *BasePtr = Add->getOperand(0).getNode(); - for (SDNode *U : BasePtr->uses()) { - if (U->getOpcode() == ISD::INTRINSIC_WO_CHAIN && - U->getConstantOperandVal(0) == IID) { - // We've found another LVSL/LVSR, and this address is an aligned - // multiple of that one. The results will be the same, so use the - // one we've just found instead. - - return SDValue(U, 0); - } + for (SDNode *U : BasePtr->users()) { + if (U->getOpcode() == ISD::INTRINSIC_WO_CHAIN && + U->getConstantOperandVal(0) == IID) { + // We've found another LVSL/LVSR, and this address is an aligned + // multiple of that one. The results will be the same, so use the + // one we've just found instead. 
+ + return SDValue(U, 0); + } } } if (isa(Add->getOperand(1))) { SDNode *BasePtr = Add->getOperand(0).getNode(); - for (SDNode *U : BasePtr->uses()) { - if (U->getOpcode() == ISD::ADD && - isa(U->getOperand(1)) && - (Add->getConstantOperandVal(1) - U->getConstantOperandVal(1)) % - (1ULL << Bits) == - 0) { - SDNode *OtherAdd = U; - for (SDNode *V : OtherAdd->uses()) { - if (V->getOpcode() == ISD::INTRINSIC_WO_CHAIN && - V->getConstantOperandVal(0) == IID) { - return SDValue(V, 0); + for (SDNode *U : BasePtr->users()) { + if (U->getOpcode() == ISD::ADD && + isa(U->getOperand(1)) && + (Add->getConstantOperandVal(1) - U->getConstantOperandVal(1)) % + (1ULL << Bits) == + 0) { + SDNode *OtherAdd = U; + for (SDNode *V : OtherAdd->users()) { + if (V->getOpcode() == ISD::INTRINSIC_WO_CHAIN && + V->getConstantOperandVal(0) == IID) { + return SDValue(V, 0); + } } } } - } } } diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index ccf34b8a6b2b0..4393d33021760 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -2614,21 +2614,21 @@ static bool selectConstantAddr(SelectionDAG *CurDAG, const SDLoc &DL, // Is this ADD instruction only used as the base pointer of scalar loads and // stores? static bool isWorthFoldingAdd(SDValue Add) { - for (auto *Use : Add->uses()) { - if (Use->getOpcode() != ISD::LOAD && Use->getOpcode() != ISD::STORE && - Use->getOpcode() != ISD::ATOMIC_LOAD && - Use->getOpcode() != ISD::ATOMIC_STORE) + for (auto *User : Add->users()) { + if (User->getOpcode() != ISD::LOAD && User->getOpcode() != ISD::STORE && + User->getOpcode() != ISD::ATOMIC_LOAD && + User->getOpcode() != ISD::ATOMIC_STORE) return false; - EVT VT = cast(Use)->getMemoryVT(); + EVT VT = cast(User)->getMemoryVT(); if (!VT.isScalarInteger() && VT != MVT::f16 && VT != MVT::f32 && VT != MVT::f64) return false; // Don't allow stores of the value. It must be used as the address. - if (Use->getOpcode() == ISD::STORE && - cast(Use)->getValue() == Add) + if (User->getOpcode() == ISD::STORE && + cast(User)->getValue() == Add) return false; - if (Use->getOpcode() == ISD::ATOMIC_STORE && - cast(Use)->getVal() == Add) + if (User->getOpcode() == ISD::ATOMIC_STORE && + cast(User)->getVal() == Add) return false; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index affc29ec18ff7..9383e700ade86 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -16310,7 +16310,7 @@ static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG, // All users should be a shift by constant less than or equal to 32. This // ensures we'll do this optimization for each of them to produce an // add/sub+sext_inreg they can all share. - for (SDNode *U : N0->uses()) { + for (SDNode *U : N0->users()) { if (U->getOpcode() != ISD::SRA || !isa(U->getOperand(1)) || U->getConstantOperandVal(1) > 32) @@ -18374,7 +18374,7 @@ bool RISCVTargetLowering::isDesirableToCommuteWithShift( // LD/ST, it can still complete the folding optimization operation performed // above. auto isUsedByLdSt = [](const SDNode *X, const SDNode *User) { - for (SDNode *Use : X->uses()) { + for (SDNode *Use : X->users()) { // This use is the one we're on right now. 
Skip it if (Use == User || Use->getOpcode() == ISD::SELECT) continue; @@ -20511,7 +20511,7 @@ bool RISCVTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { // The copy must be used by a RISCVISD::RET_GLUE, and nothing else. bool HasRet = false; - for (SDNode *Node : Copy->uses()) { + for (SDNode *Node : Copy->users()) { if (Node->getOpcode() != RISCVISD::RET_GLUE) return false; HasRet = true; diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp index 403d238aa5b52..210e3c5426f46 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp @@ -1890,7 +1890,7 @@ SystemZDAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *CCRegUser = nullptr; if (CCUser->getOpcode() == ISD::CopyToReg || cast(CCUser->getOperand(1))->getReg() == SystemZ::CC) { - for (auto *U : CCUser->uses()) { + for (auto *U : CCUser->users()) { if (CCRegUser == nullptr) CCRegUser = U; else if (CCRegUser != U) diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index becc3936eef89..47008af3479ee 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -2910,7 +2910,7 @@ static void adjustForSubtraction(SelectionDAG &DAG, const SDLoc &DL, Comparison &C) { if (C.CCMask == SystemZ::CCMASK_CMP_EQ || C.CCMask == SystemZ::CCMASK_CMP_NE) { - for (SDNode *N : C.Op0->uses()) { + for (SDNode *N : C.Op0->users()) { if (N->getOpcode() == ISD::SUB && ((N->getOperand(0) == C.Op0 && N->getOperand(1) == C.Op1) || (N->getOperand(0) == C.Op1 && N->getOperand(1) == C.Op0))) { @@ -2936,7 +2936,7 @@ static void adjustForFNeg(Comparison &C) { return; auto *C1 = dyn_cast(C.Op1); if (C1 && C1->isZero()) { - for (SDNode *N : C.Op0->uses()) { + for (SDNode *N : C.Op0->users()) { if (N->getOpcode() == ISD::FNEG) { C.Op0 = SDValue(N, 0); C.CCMask = SystemZ::reverseCCMask(C.CCMask); @@ -2960,7 +2960,7 @@ static void adjustForLTGFR(Comparison &C) { if (C1 && C1->getZExtValue() == 32) { SDValue ShlOp0 = C.Op0.getOperand(0); // See whether X has any SIGN_EXTEND_INREG uses. 
- for (SDNode *N : ShlOp0->uses()) { + for (SDNode *N : ShlOp0->users()) { if (N->getOpcode() == ISD::SIGN_EXTEND_INREG && cast(N->getOperand(1))->getVT() == MVT::i32) { C.Op0 = SDValue(N, 0); @@ -7289,7 +7289,7 @@ static bool isVectorElementSwap(ArrayRef M, EVT VT) { } static bool isOnlyUsedByStores(SDValue StoredVal, SelectionDAG &DAG) { - for (auto *U : StoredVal->uses()) { + for (auto *U : StoredVal->users()) { if (StoreSDNode *ST = dyn_cast(U)) { EVT CurrMemVT = ST->getMemoryVT().getScalarType(); if (CurrMemVT.isRound() && CurrMemVT.getStoreSize() <= 16) @@ -7668,7 +7668,7 @@ SDValue SystemZTargetLowering::combineFP_ROUND( Op0.getOperand(1).getOpcode() == ISD::Constant && Op0.getConstantOperandVal(1) == 0) { SDValue Vec = Op0.getOperand(0); - for (auto *U : Vec->uses()) { + for (auto *U : Vec->users()) { if (U != Op0.getNode() && U->hasOneUse() && U->getOpcode() == ISD::EXTRACT_VECTOR_ELT && U->getOperand(0) == Vec && @@ -7732,7 +7732,7 @@ SDValue SystemZTargetLowering::combineFP_EXTEND( Op0.getOperand(1).getOpcode() == ISD::Constant && Op0.getConstantOperandVal(1) == 0) { SDValue Vec = Op0.getOperand(0); - for (auto *U : Vec->uses()) { + for (auto *U : Vec->users()) { if (U != Op0.getNode() && U->hasOneUse() && U->getOpcode() == ISD::EXTRACT_VECTOR_ELT && U->getOperand(0) == Vec && diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp index a56b5a2ac9a3e..87c1625c11454 100644 --- a/llvm/lib/Target/VE/VEISelLowering.cpp +++ b/llvm/lib/Target/VE/VEISelLowering.cpp @@ -2951,7 +2951,7 @@ static bool isI32Insn(const SDNode *User, const SDNode *N) { static bool isI32InsnAllUses(const SDNode *User, const SDNode *N) { // Check all use of User node. If all of them are safe, optimize // truncate to extract_subreg. - for (const SDNode *U : User->uses()) { + for (const SDNode *U : User->users()) { switch (U->getOpcode()) { default: // If the use is an instruction which treats the source operand as i32, @@ -3002,7 +3002,7 @@ SDValue VETargetLowering::combineTRUNCATE(SDNode *N, return SDValue(); // Check all use of this TRUNCATE. - for (const SDNode *User : N->uses()) { + for (const SDNode *User : N->users()) { // Make sure that we're not going to replace TRUNCATE for non i32 // instructions. // diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 76ef207f7d47d..bb20e6ecf281b 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -370,7 +370,7 @@ namespace { return false; // Walk all the users of the immediate. 
- for (const SDNode *User : N->uses()) { + for (const SDNode *User : N->users()) { if (UseCount >= 2) break; @@ -1095,7 +1095,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { SDNode *MaxLd = nullptr; SDValue Ptr = Ld->getBasePtr(); SDValue Chain = Ld->getChain(); - for (SDNode *User : Ptr->uses()) { + for (SDNode *User : Ptr->users()) { auto *UserLd = dyn_cast(User); MVT UserVT = User->getSimpleValueType(0); if (User != N && UserLd && ISD::isNormalLoad(User) && diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 35c0974733aba..4bd65dc6ade40 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -7397,7 +7397,7 @@ static Constant *getConstantVector(MVT VT, const APInt &SplatValue, } static bool isFoldableUseOfShuffle(SDNode *N) { - for (auto *U : N->uses()) { + for (auto *U : N->users()) { unsigned Opc = U->getOpcode(); // VPERMV/VPERMV3 shuffles can never fold their index operands. if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N) @@ -16004,7 +16004,7 @@ static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, // Find the intersection between shuffle users of V1 and V2. SmallVector Shuffles; - for (SDNode *User : V1->uses()) + for (SDNode *User : V1->users()) if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 && User->getOperand(1) == V2) Shuffles.push_back(User); @@ -18280,7 +18280,7 @@ static APInt getExtractedDemandedElts(SDNode *N) { MVT VT = N->getSimpleValueType(0); unsigned NumElts = VT.getVectorNumElements(); APInt DemandedElts = APInt::getZero(NumElts); - for (SDNode *User : N->uses()) { + for (SDNode *User : N->users()) { switch (User->getOpcode()) { case X86ISD::PEXTRB: case X86ISD::PEXTRW: @@ -22143,7 +22143,7 @@ static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { // If this is a FABS and it has an FNEG user, bail out to fold the combination // into an FNABS. We'll lower the FABS after that if it is still in use. if (IsFABS) - for (SDNode *User : Op->uses()) + for (SDNode *User : Op->users()) if (User->getOpcode() == ISD::FNEG) return Op; @@ -22888,7 +22888,7 @@ static bool hasNonFlagsUse(SDValue Op) { // using an RMW op or only the flags are used. Otherwise, leave // the node alone and emit a 'cmp' or 'test' instruction. static bool isProfitableToUseFlagOp(SDValue Op) { - for (SDNode *U : Op->uses()) + for (SDNode *U : Op->users()) if (U->getOpcode() != ISD::CopyToReg && U->getOpcode() != ISD::SETCC && U->getOpcode() != ISD::STORE) @@ -41712,7 +41712,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, // Share broadcast with the longest vector and extract low subvector (free). // Ensure the same SDValue from the SDNode use is being used. - for (SDNode *User : Src->uses()) + for (SDNode *User : Src->users()) if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST && Src == User->getOperand(0) && User->getValueSizeInBits(0).getFixedValue() > @@ -42910,7 +42910,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( // If we reuse the shift amount just for sse shift amounts then we know that // only the bottom 64-bits are only ever used. 
- bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) { + bool AssumeSingleUse = llvm::all_of(Amt->users(), [&Amt](SDNode *Use) { unsigned UseOpc = Use->getOpcode(); return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL || UseOpc == X86ISD::VSRA) && @@ -45670,7 +45670,7 @@ combineExtractFromVectorLoad(SDNode *N, EVT VecVT, SDValue SrcVec, uint64_t Idx, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = N->getValueType(0); - bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) { + bool LikelyUsedAsVector = any_of(N->users(), [](SDNode *Use) { return Use->getOpcode() == ISD::STORE || Use->getOpcode() == ISD::INSERT_VECTOR_ELT || Use->getOpcode() == ISD::SCALAR_TO_VECTOR; @@ -46338,7 +46338,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, return false; }; // TODO: Can we drop the oneuse check for constant extracts? - if (all_of(InputVector->uses(), IsBoolExtract) && + if (all_of(InputVector->users(), IsBoolExtract) && (IsVar || BoolExtracts.size() > 1)) { EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts); if (SDValue BC = @@ -46754,7 +46754,7 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, // the generic VSELECT anymore. Otherwise, we may perform wrong // optimizations as we messed with the actual expectation for the vector // boolean values. - for (SDNode *U : Cond->uses()) { + for (SDNode *U : Cond->users()) { if (U->getOpcode() == X86ISD::BLENDV) continue; @@ -49937,7 +49937,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, (VT == MVT::f16 && Subtarget.hasFP16())) { bool ExpectingFlags = false; // Check for any users that want flags: - for (const SDNode *U : N->uses()) { + for (const SDNode *U : N->users()) { if (ExpectingFlags) break; @@ -50765,7 +50765,7 @@ static SDValue combineX86SubCmpForFlags(SDNode *N, SDValue Flag, return SDValue(); // Check the only user of flag is `brcond ne`. - SDNode *BrCond = *Flag->uses().begin(); + SDNode *BrCond = *Flag->use_begin(); if (BrCond->getOpcode() != X86ISD::BRCOND) return SDValue(); unsigned CondNo = 2; @@ -52179,7 +52179,7 @@ static SDValue combineConstantPoolLoads(SDNode *N, const SDLoc &dl, // Look through all other loads/broadcasts in the chain for another constant // pool entry. - for (SDNode *User : Chain->uses()) { + for (SDNode *User : Chain->users()) { auto *UserLd = dyn_cast(User); if (User != N && UserLd && (User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD || @@ -52289,7 +52289,7 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, (RegVT.is128BitVector() || RegVT.is256BitVector())) { SDValue Ptr = Ld->getBasePtr(); SDValue Chain = Ld->getChain(); - for (SDNode *User : Chain->uses()) { + for (SDNode *User : Chain->users()) { auto *UserLd = dyn_cast(User); if (User != N && UserLd && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD && @@ -53150,8 +53150,8 @@ static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, return User->getOpcode() == HOpcode && User->getValueType(0) == VT; }; ForceHorizOp = - ForceHorizOp || (llvm::any_of(NewLHS->uses(), FoundHorizUser) && - llvm::any_of(NewRHS->uses(), FoundHorizUser)); + ForceHorizOp || (llvm::any_of(NewLHS->users(), FoundHorizUser) && + llvm::any_of(NewRHS->users(), FoundHorizUser)); // Assume a SingleSource HOP if we only shuffle one input and don't need to // shuffle the result. 
@@ -54878,7 +54878,7 @@ static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, // of single 'add' instructions, but the cost model for selecting an LEA // currently has a high threshold. bool HasLEAPotential = false; - for (auto *User : Ext->uses()) { + for (auto *User : Ext->users()) { if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) { HasLEAPotential = true; break; @@ -55066,10 +55066,11 @@ static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG) { // Check if we can eliminate V. We assume if a value is only used in FMAs, we // can eliminate it. Since this function is invoked for each FMA with this // vector. - auto IsNotFMA = [](SDNode *Use) { - return Use->getOpcode() != ISD::FMA && Use->getOpcode() != ISD::STRICT_FMA; + auto IsNotFMA = [](SDNode *User) { + return User->getOpcode() != ISD::FMA && + User->getOpcode() != ISD::STRICT_FMA; }; - if (llvm::any_of(V->uses(), IsNotFMA)) + if (llvm::any_of(V->users(), IsNotFMA)) return SDValue(); SmallVector Ops; @@ -55090,7 +55091,7 @@ static SDValue getInvertedVectorForFMA(SDValue V, SelectionDAG &DAG) { // If an inverted version cannot be eliminated, choose it instead of the // original version. - if (llvm::any_of(NV->uses(), IsNotFMA)) + if (llvm::any_of(NV->users(), IsNotFMA)) return SDValue(NV, 0); // If the inverted version also can be eliminated, we have to consistently @@ -56183,7 +56184,7 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, static bool needCarryOrOverflowFlag(SDValue Flags) { assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!"); - for (const SDNode *User : Flags->uses()) { + for (const SDNode *User : Flags->users()) { X86::CondCode CC; switch (User->getOpcode()) { default: @@ -56218,7 +56219,7 @@ static bool needCarryOrOverflowFlag(SDValue Flags) { static bool onlyZeroFlagUsed(SDValue Flags) { assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!"); - for (const SDNode *User : Flags->uses()) { + for (const SDNode *User : Flags->users()) { unsigned CCOpNo; switch (User->getOpcode()) { default: @@ -56829,7 +56830,7 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL, // TODO: If target has "slow3OpsLEA", do this even without the trailing memop? if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() && !isa(OtherOp.getOperand(0)) && - all_of(N->uses(), [&](SDNode *Use) { + all_of(N->users(), [&](SDNode *Use) { auto *MemNode = dyn_cast(Use); return MemNode && MemNode->getBasePtr().getNode() == N; })) { @@ -58485,7 +58486,7 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG, // See if we're broadcasting the scalar value, in which case just reuse that. // Ensure the same SDValue from the SDNode use is being used. if (VT.getScalarType() == Src.getValueType()) - for (SDNode *User : Src->uses()) + for (SDNode *User : Src->users()) if (User->getOpcode() == X86ISD::VBROADCAST && Src == User->getOperand(0)) { unsigned SizeInBits = VT.getFixedSizeInBits(); @@ -58881,7 +58882,7 @@ static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, // Look at other users of our base pointer and try to find a wider broadcast. // The input chain and the size of the memory VT must match. 
- for (SDNode *User : Ptr->uses()) + for (SDNode *User : Ptr->users()) if (User != N && User->getOpcode() == N->getOpcode() && cast(User)->getBasePtr() == Ptr && cast(User)->getChain() == Chain && diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 05a5a36ce5cbe..df12ea2f79df5 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -955,7 +955,7 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { return false; bool HasRet = false; - for (const SDNode *U : Copy->uses()) { + for (const SDNode *U : Copy->users()) { if (U->getOpcode() != X86ISD::RET_GLUE) return false; // If we are returning more than one value, we can definitely From 2302142f2318ba9624b847cd8c1a7e2d255be5c5 Mon Sep 17 00:00:00 2001 From: Tyler Nowicki Date: Wed, 18 Dec 2024 23:47:00 -0500 Subject: [PATCH 008/209] [Coroutines][Docs] Add a discussion on the handling of certain parameter attribs (#117183) ByVal arguments and Swifterror require special handling in the coroutine passes. The goal of this section is to provide a description of how these parameter attributes are handled. --- llvm/docs/Coroutines.rst | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/llvm/docs/Coroutines.rst b/llvm/docs/Coroutines.rst index 92e138b6893b2..60e32dc467d27 100644 --- a/llvm/docs/Coroutines.rst +++ b/llvm/docs/Coroutines.rst @@ -810,6 +810,28 @@ The LLVM IR for a coroutine using a Coroutine with a custom ABI looks like: ret ptr %hdl } +Parameter Attributes +==================== +Some parameter attributes, used to communicate additional information about the result or parameters of a function, require special handling. + +ByVal +----- +A ByVal parameter on an argument indicates that the pointee should be treated as being passed by value to the function. +Prior to the coroutine transforms loads and stores to/from the pointer are generated where the value is needed. +Consequently, a ByVal argument is treated much like an alloca. +Space is allocated for it on the coroutine frame and the uses of the argument pointer are replaced with a pointer to the coroutine frame. + +Swift Error +----------- +Clang supports the swiftcall calling convention in many common targets, and a user could call a function that takes a swifterror argument from a C++ coroutine. +The swifterror parameter attribute exists to model and optimize Swift error handling. +A swifterror alloca or parameter can only be loaded, stored, or passed as a swifterror call argument, and a swifterror call argument can only be a direct reference to a swifterror alloca or parameter. +These rules, not coincidentally, mean that you can always perfectly model the data flow in the alloca, and LLVM CodeGen actually has to do that in order to emit code. + +For coroutine lowering the default treatment of allocas breaks those rules — splitting will try to replace the alloca with an entry in the coro frame, which can lead to trying to pass that as a swifterror argument. +To pass a swifterror argument in a split function, we need to still have the alloca around; but we also potentially need the coro frame slot, since useful data can (in theory) be stored in the swifterror alloca slot across suspensions in the presplit coroutine. +When split a coroutine it is consequently necessary to keep both the frame slot as well as the alloca itself and then keep them in sync. 
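+
+For illustration, the following minimal sketch (hypothetical code with placeholder function names, shown only to make the rules concrete) is IR that obeys the swifterror data-flow rules described above:
+
+.. code-block:: llvm
+
+  declare swiftcc void @may_fail(ptr swifterror)
+
+  define swiftcc void @caller() {
+  entry:
+    %err = alloca swifterror ptr, align 8
+    store ptr null, ptr %err
+    ; The only legal uses of %err: loads, stores, and swifterror call arguments.
+    call swiftcc void @may_fail(ptr swifterror %err)
+    %e = load ptr, ptr %err
+    ret void
+  }
+
+When such an alloca is live across a suspend point in a presplit coroutine, the split functions keep the alloca itself so that calls like the one above remain well formed, while its contents are kept in sync with the corresponding coroutine frame field.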
+ Intrinsics ========== From 2c782ab2718758bd106ad5939adf7cfb6cd9d1e9 Mon Sep 17 00:00:00 2001 From: Pengcheng Wang Date: Thu, 19 Dec 2024 13:00:08 +0800 Subject: [PATCH 009/209] [RISCV] Add software pipeliner support (#117546) This patch adds basic support of `MachinePipeliner` and disable it by default. The functionality should be OK and all llvm-test-suite tests have passed. --- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 81 ++++++++++++++ llvm/lib/Target/RISCV/RISCVInstrInfo.h | 3 + llvm/lib/Target/RISCV/RISCVSubtarget.cpp | 4 + llvm/lib/Target/RISCV/RISCVSubtarget.h | 4 + llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 8 ++ llvm/test/CodeGen/RISCV/machine-pipeliner.ll | 109 +++++++++++++++++++ 6 files changed, 209 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/machine-pipeliner.ll diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 7e0063589b6f4..0af8161a307ab 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -4248,3 +4248,84 @@ bool RISCV::isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS) { return false; return LHS.getImm() <= RHS.getImm(); } + +namespace { +class RISCVPipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo { + const MachineInstr *LHS; + const MachineInstr *RHS; + SmallVector Cond; + +public: + RISCVPipelinerLoopInfo(const MachineInstr *LHS, const MachineInstr *RHS, + const SmallVectorImpl &Cond) + : LHS(LHS), RHS(RHS), Cond(Cond.begin(), Cond.end()) {} + + bool shouldIgnoreForPipelining(const MachineInstr *MI) const override { + // Make the instructions for loop control be placed in stage 0. + // The predecessors of LHS/RHS are considered by the caller. + if (LHS && MI == LHS) + return true; + if (RHS && MI == RHS) + return true; + return false; + } + + std::optional createTripCountGreaterCondition( + int TC, MachineBasicBlock &MBB, + SmallVectorImpl &CondParam) override { + // A branch instruction will be inserted as "if (Cond) goto epilogue". + // Cond is normalized for such use. + // The predecessors of the branch are assumed to have already been inserted. 
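+    // Illustration (not from this patch): for a latch ending in
+    // "BNE x, y, <loop>", analyzeLoopForPipelining() below records
+    // Cond = {BNE, x, y} and, because the taken target is the loop itself,
+    // reverses it to {BEQ, x, y}, so the condition handed back here holds
+    // exactly when the loop should exit into the epilogue.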
+ CondParam = Cond; + return {}; + } + + void setPreheader(MachineBasicBlock *NewPreheader) override {} + + void adjustTripCount(int TripCountAdjust) override {} + + void disposed() override {} +}; +} // namespace + +std::unique_ptr +RISCVInstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const { + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + SmallVector Cond; + if (analyzeBranch(*LoopBB, TBB, FBB, Cond, /*AllowModify=*/false)) + return nullptr; + + // Infinite loops are not supported + if (TBB == LoopBB && FBB == LoopBB) + return nullptr; + + // Must be conditional branch + if (FBB == nullptr) + return nullptr; + + assert((TBB == LoopBB || FBB == LoopBB) && + "The Loop must be a single-basic-block loop"); + + // Normalization for createTripCountGreaterCondition() + if (TBB == LoopBB) + reverseBranchCondition(Cond); + + const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo(); + auto FindRegDef = [&MRI](MachineOperand &Op) -> const MachineInstr * { + if (!Op.isReg()) + return nullptr; + Register Reg = Op.getReg(); + if (!Reg.isVirtual()) + return nullptr; + return MRI.getVRegDef(Reg); + }; + + const MachineInstr *LHS = FindRegDef(Cond[1]); + const MachineInstr *RHS = FindRegDef(Cond[2]); + if (LHS && LHS->isPHI()) + return nullptr; + if (RHS && RHS->isPHI()) + return nullptr; + + return std::make_unique(LHS, RHS, Cond); +} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index 005cba5d35610..7e8bcd451a8ef 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -298,6 +298,9 @@ class RISCVInstrInfo : public RISCVGenInstrInfo { unsigned getTailDuplicateSize(CodeGenOptLevel OptLevel) const override; + std::unique_ptr + analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override; + protected: const RISCVSubtarget &STI; diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index 90131d82534b1..6e212dc58e6dd 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -194,6 +194,10 @@ bool RISCVSubtarget::useRVVForFixedLengthVectors() const { bool RISCVSubtarget::enableSubRegLiveness() const { return true; } +bool RISCVSubtarget::enableMachinePipeliner() const { + return getSchedModel().hasInstrSchedModel(); +} + /// Enable use of alias analysis during code generation (during MI /// scheduling, DAGCombine, etc.). 
bool RISCVSubtarget::useAA() const { return UseAA; } diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h index 096d696c71f8f..87d508c394173 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.h +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h @@ -324,6 +324,10 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo { bool enableSubRegLiveness() const override; + bool enableMachinePipeliner() const override; + + bool useDFAforSMS() const override { return false; } + bool useAA() const override; unsigned getCacheLineSize() const override { diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 0b8407943a907..f6ccbfbe217df 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -112,6 +112,11 @@ static cl::opt DisableVectorMaskMutation( cl::desc("Disable the vector mask scheduling mutation"), cl::init(false), cl::Hidden); +static cl::opt + EnableMachinePipeliner("riscv-enable-pipeliner", + cl::desc("Enable Machine Pipeliner for RISC-V"), + cl::init(false), cl::Hidden); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { RegisterTargetMachine X(getTheRISCV32Target()); RegisterTargetMachine Y(getTheRISCV64Target()); @@ -603,6 +608,9 @@ void RISCVPassConfig::addPreRegAlloc() { addPass(createRISCVInsertReadWriteCSRPass()); addPass(createRISCVInsertWriteVXRMPass()); addPass(createRISCVLandingPadSetupPass()); + + if (TM->getOptLevel() != CodeGenOptLevel::None && EnableMachinePipeliner) + addPass(&MachinePipelinerID); } void RISCVPassConfig::addFastRegAlloc() { diff --git a/llvm/test/CodeGen/RISCV/machine-pipeliner.ll b/llvm/test/CodeGen/RISCV/machine-pipeliner.ll new file mode 100644 index 0000000000000..d250098576687 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/machine-pipeliner.ll @@ -0,0 +1,109 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv64 -mcpu=sifive-p670 -O3 -verify-machineinstrs -riscv-enable-pipeliner=false < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-NOT-PIPELINED +; RUN: llc -mtriple=riscv64 -mcpu=sifive-p670 -O3 -verify-machineinstrs -riscv-enable-pipeliner=true < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-PIPELINED + +; We shouldn't pipeline this loop as one operand of branch is a PHI. 
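+; (analyzeLoopForPipelining() returns nullptr when either register operand of
+; the latch branch is defined by a PHI, so the pipeliner leaves this loop
+; alone.)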
+define i32 @test_phi() { +; CHECK-LABEL: test_phi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li a0, 0 +; CHECK-NEXT: .LBB0_1: # %for.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: li a0, 1 +; CHECK-NEXT: sh a0, 0(zero) +; CHECK-NEXT: bnez a1, .LBB0_1 +; CHECK-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-NEXT: li a0, 0 +; CHECK-NEXT: ret +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret i32 0 + +for.body: ; preds = %for.body, %entry + %indvars.iv1 = phi i64 [ 0, %entry ], [ 1, %for.body ] + store i16 1, ptr null, align 4 + %exitcond.not.31 = icmp eq i64 %indvars.iv1, 0 + br i1 %exitcond.not.31, label %for.cond.cleanup, label %for.body +} + +define void @test_pipelined_1(ptr noalias %in, ptr noalias %out, i32 signext %cnt) { +; CHECK-NOT-PIPELINED-LABEL: test_pipelined_1: +; CHECK-NOT-PIPELINED: # %bb.0: # %entry +; CHECK-NOT-PIPELINED-NEXT: blez a2, .LBB1_3 +; CHECK-NOT-PIPELINED-NEXT: # %bb.1: # %for.body.preheader +; CHECK-NOT-PIPELINED-NEXT: addi a2, a2, -1 +; CHECK-NOT-PIPELINED-NEXT: sh2add.uw a2, a2, a1 +; CHECK-NOT-PIPELINED-NEXT: addi a2, a2, 4 +; CHECK-NOT-PIPELINED-NEXT: .LBB1_2: # %for.body +; CHECK-NOT-PIPELINED-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NOT-PIPELINED-NEXT: lw a3, 0(a1) +; CHECK-NOT-PIPELINED-NEXT: addi a1, a1, 4 +; CHECK-NOT-PIPELINED-NEXT: addi a3, a3, 1 +; CHECK-NOT-PIPELINED-NEXT: sw a3, 0(a0) +; CHECK-NOT-PIPELINED-NEXT: addi a0, a0, 4 +; CHECK-NOT-PIPELINED-NEXT: bne a1, a2, .LBB1_2 +; CHECK-NOT-PIPELINED-NEXT: .LBB1_3: # %for.end +; CHECK-NOT-PIPELINED-NEXT: ret +; +; CHECK-PIPELINED-LABEL: test_pipelined_1: +; CHECK-PIPELINED: # %bb.0: # %entry +; CHECK-PIPELINED-NEXT: blez a2, .LBB1_6 +; CHECK-PIPELINED-NEXT: # %bb.1: # %for.body.preheader +; CHECK-PIPELINED-NEXT: lw a4, 0(a1) +; CHECK-PIPELINED-NEXT: addi a2, a2, -1 +; CHECK-PIPELINED-NEXT: sh2add.uw a6, a2, a1 +; CHECK-PIPELINED-NEXT: addi a2, a0, 4 +; CHECK-PIPELINED-NEXT: addi a1, a1, 4 +; CHECK-PIPELINED-NEXT: addi a6, a6, 4 +; CHECK-PIPELINED-NEXT: beq a1, a6, .LBB1_5 +; CHECK-PIPELINED-NEXT: # %bb.2: # %for.body +; CHECK-PIPELINED-NEXT: lw a5, 0(a1) +; CHECK-PIPELINED-NEXT: addi a3, a2, 4 +; CHECK-PIPELINED-NEXT: addi a4, a4, 1 +; CHECK-PIPELINED-NEXT: addi a1, a1, 4 +; CHECK-PIPELINED-NEXT: beq a1, a6, .LBB1_4 +; CHECK-PIPELINED-NEXT: .LBB1_3: # %for.body +; CHECK-PIPELINED-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-PIPELINED-NEXT: sw a4, 0(a0) +; CHECK-PIPELINED-NEXT: mv a4, a5 +; CHECK-PIPELINED-NEXT: lw a5, 0(a1) +; CHECK-PIPELINED-NEXT: mv a0, a2 +; CHECK-PIPELINED-NEXT: mv a2, a3 +; CHECK-PIPELINED-NEXT: addi a3, a3, 4 +; CHECK-PIPELINED-NEXT: addi a4, a4, 1 +; CHECK-PIPELINED-NEXT: addi a1, a1, 4 +; CHECK-PIPELINED-NEXT: bne a1, a6, .LBB1_3 +; CHECK-PIPELINED-NEXT: .LBB1_4: +; CHECK-PIPELINED-NEXT: sw a4, 0(a0) +; CHECK-PIPELINED-NEXT: mv a0, a2 +; CHECK-PIPELINED-NEXT: mv a4, a5 +; CHECK-PIPELINED-NEXT: .LBB1_5: +; CHECK-PIPELINED-NEXT: addi a4, a4, 1 +; CHECK-PIPELINED-NEXT: sw a4, 0(a0) +; CHECK-PIPELINED-NEXT: .LBB1_6: # %for.end +; CHECK-PIPELINED-NEXT: ret +entry: + %cmp = icmp sgt i32 %cnt, 0 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %inc.next = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %in.addr.next = phi ptr [ %incdec.in, %for.body ], [ %in, %entry ] + %out.addr.next = phi ptr [ %incdec.out, %for.body ], [ %out, %entry ] + %0 = load i32, ptr %out.addr.next, align 4 + %1 = add i32 %0, 1 + store i32 %1, ptr %in.addr.next, align 4 + %incdec.in 
= getelementptr inbounds i8, ptr %in.addr.next, i64 4 + %incdec.out = getelementptr inbounds i8, ptr %out.addr.next, i64 4 + %inc = add nuw nsw i32 %inc.next, 1 + %exitcond.not = icmp eq i32 %inc, %cnt + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} From 5c55f9664f7e2f9fe29589a97bc5818d6b8d3c9c Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Thu, 19 Dec 2024 13:12:01 +0800 Subject: [PATCH 010/209] [Clang] Don't assume unexpanded PackExpansions' size when expanding packs (#120380) CheckParameterPacksForExpansion() previously assumed that template arguments don't include PackExpansion types when attempting another pack expansion (i.e. when NumExpansions is present). However, this assumption doesn't hold for type aliases, whose substitution might involve unexpanded packs. This can lead to incorrect diagnostics during substitution because the pack size is not yet determined. To address this, this patch calculates the minimum pack size (ignoring unexpanded PackExpansionTypes) and compares it to the previously expanded size. If the minimum pack size is smaller, then there's still a chance for future substitution to expand it to a correct size, so we don't diagnose it too eagerly. Fixes #61415 Fixes #32252 Fixes #17042 --- clang/docs/ReleaseNotes.rst | 1 + .../clang/Basic/DiagnosticSemaKinds.td | 4 +- clang/lib/Sema/SemaTemplateVariadic.cpp | 57 ++++++++++++++++-- clang/test/SemaTemplate/pack-deduction.cpp | 59 +++++++++++++++++++ 4 files changed, 113 insertions(+), 8 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 29794f27d3005..5f91ff9063403 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -850,6 +850,7 @@ Bug Fixes to C++ Support - Clang no longer rejects deleting a pointer of incomplete enumeration type. (#GH99278) - Fixed recognition of ``std::initializer_list`` when it's surrounded with ``extern "C++"`` and exported out of a module (which is the case e.g. in MSVC's implementation of ``std`` module). (#GH118218) +- Fixed a pack expansion issue in checking unexpanded parameter sizes. (#GH17042) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index d67a81f8564a8..7bd154e7da2f4 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -5863,10 +5863,10 @@ def err_pack_expansion_without_parameter_packs : Error< "pack expansion does not contain any unexpanded parameter packs">; def err_pack_expansion_length_conflict : Error< "pack expansion contains parameter packs %0 and %1 that have different " - "lengths (%2 vs. %3)">; + "lengths (%2 vs. %select{|at least }3%4))">; def err_pack_expansion_length_conflict_multilevel : Error< "pack expansion contains parameter pack %0 that has a different " - "length (%1 vs. %2) from outer parameter packs">; + "length (%1 vs. %select{|at least }2%3) from outer parameter packs">; def err_pack_expansion_length_conflict_partial : Error< "pack expansion contains parameter pack %0 that has a different " "length (at least %1 vs. 
%2) from outer parameter packs">; diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp index 88a21240e1c80..c8452db6bc901 100644 --- a/clang/lib/Sema/SemaTemplateVariadic.cpp +++ b/clang/lib/Sema/SemaTemplateVariadic.cpp @@ -780,7 +780,7 @@ bool Sema::CheckParameterPacksForExpansion( } // Determine the size of this argument pack. - unsigned NewPackSize; + unsigned NewPackSize, PendingPackExpansionSize = 0; if (IsVarDeclPack) { // Figure out whether we're instantiating to an argument pack or not. typedef LocalInstantiationScope::DeclArgumentPack DeclArgumentPack; @@ -808,7 +808,25 @@ bool Sema::CheckParameterPacksForExpansion( } // Determine the size of the argument pack. - NewPackSize = TemplateArgs(Depth, Index).pack_size(); + ArrayRef Pack = + TemplateArgs(Depth, Index).getPackAsArray(); + NewPackSize = Pack.size(); + PendingPackExpansionSize = + llvm::count_if(Pack, [](const TemplateArgument &TA) { + if (!TA.isPackExpansion()) + return false; + + if (TA.getKind() == TemplateArgument::Type) + return !TA.getAsType() + ->getAs() + ->getNumExpansions(); + + if (TA.getKind() == TemplateArgument::Expression) + return !cast(TA.getAsExpr()) + ->getNumExpansions(); + + return !TA.getNumTemplateExpansions(); + }); } // C++0x [temp.arg.explicit]p9: @@ -831,7 +849,7 @@ bool Sema::CheckParameterPacksForExpansion( } if (!NumExpansions) { - // The is the first pack we've seen for which we have an argument. + // This is the first pack we've seen for which we have an argument. // Record it. NumExpansions = NewPackSize; FirstPack.first = Name; @@ -841,17 +859,44 @@ bool Sema::CheckParameterPacksForExpansion( } if (NewPackSize != *NumExpansions) { + // In some cases, we might be handling packs with unexpanded template + // arguments. For example, this can occur when substituting into a type + // alias declaration that uses its injected template parameters as + // arguments: + // + // template struct S { + // template using Alias = S; + // }; + // + // Consider an instantiation attempt like 'S::Alias', where + // Pack comes from another template parameter. 'S' is first + // instantiated, expanding the outer pack 'Outer' to . The alias + // declaration is accordingly substituted, leaving the template arguments + // as unexpanded + // ''. + // + // Since we have no idea of the size of '' until its expansion, + // we shouldn't assume its pack size for validation. However if we are + // certain that there are extra arguments beyond unexpanded packs, in + // which case the pack size is already larger than the previous expansion, + // we can complain that before instantiation. + unsigned LeastNewPackSize = NewPackSize - PendingPackExpansionSize; + if (PendingPackExpansionSize && LeastNewPackSize <= *NumExpansions) { + ShouldExpand = false; + continue; + } // C++0x [temp.variadic]p5: // All of the parameter packs expanded by a pack expansion shall have // the same number of arguments specified. 
if (HaveFirstPack) Diag(EllipsisLoc, diag::err_pack_expansion_length_conflict) - << FirstPack.first << Name << *NumExpansions << NewPackSize + << FirstPack.first << Name << *NumExpansions + << (LeastNewPackSize != NewPackSize) << LeastNewPackSize << SourceRange(FirstPack.second) << SourceRange(ParmPack.second); else Diag(EllipsisLoc, diag::err_pack_expansion_length_conflict_multilevel) - << Name << *NumExpansions << NewPackSize - << SourceRange(ParmPack.second); + << Name << *NumExpansions << (LeastNewPackSize != NewPackSize) + << LeastNewPackSize << SourceRange(ParmPack.second); return true; } } diff --git a/clang/test/SemaTemplate/pack-deduction.cpp b/clang/test/SemaTemplate/pack-deduction.cpp index 28fb127a38644..b3104609994a4 100644 --- a/clang/test/SemaTemplate/pack-deduction.cpp +++ b/clang/test/SemaTemplate/pack-deduction.cpp @@ -199,3 +199,62 @@ constexpr auto baz(Int(T())>... x) -> int { return 1; } static_assert(baz, Int<2>, Int<3>>(Int<10>(), Int<10>(), Int<10>()) == 1, ""); } + +namespace GH17042 { + +template struct X { + template using Y = X; // #GH17042_Y +}; + +template +using any_pairs_list = X::Y; // #any_pairs_list + +template +using any_pairs_list_2 = X::Y<>; +// expected-error@#GH17042_Y {{different length (2 vs. 0)}} \ +// expected-note@-1 {{requested here}} + +template +using any_pairs_list_3 = X::Y; // #any_pairs_list_3 + +template +using any_pairs_list_4 = X::Y; +// expected-error@#GH17042_Y {{different length (2 vs. at least 3)}} \ +// expected-note@-1 {{requested here}} + +static_assert(__is_same(any_pairs_list, X), ""); + +static_assert(!__is_same(any_pairs_list, X), ""); +// expected-error@#GH17042_Y {{different length (2 vs. 3)}} \ +// expected-note@#any_pairs_list {{requested here}} \ +// expected-note@-1 {{requested here}} + +static_assert(__is_same(any_pairs_list_3, X), ""); + +static_assert(!__is_same(any_pairs_list_3, X), ""); +// expected-error@#GH17042_Y {{different length (2 vs. 3)}} \ +// expected-note@#any_pairs_list_3 {{requested here}} \ +// expected-note@-1 {{requested here}} + +namespace TemplateTemplateParameters { +template struct C {}; + +template class... Args1> struct Ttp { + template