diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/mem57-cpp.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/mem57-cpp.rst index cc0c729aaacdc..b359d85ad0cdc 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/mem57-cpp.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/mem57-cpp.rst @@ -4,7 +4,7 @@ cert-mem57-cpp ============== The `cert-mem57-cpp` check is an alias, please see -`bugprone-default-operator-new-on-overaligned-type <../bugprone/default-operator-new-on-overaligned-type>`_ +`bugprone-default-operator-new-on-overaligned-type <../bugprone/default-operator-new-on-overaligned-type.html>`_ for more information. This check corresponds to the CERT C++ Coding Standard rule diff --git a/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst b/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst index ec9ef1c60913c..6c994a48d83de 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst @@ -14,3 +14,21 @@ should be generally avoided. // becomes static std::string Moo = (Twine("bark") + "bah").str(); + +A ``Twine`` does not own the memory of its contents, so it is not +recommended to use a ``Twine`` created from temporary strings or string literals. + +.. code-block:: c++ + + static Twine getModuleIdentifier(StringRef moduleName) { + return moduleName + "_module"; + } + void foo() { + Twine result = getModuleIdentifier(std::string{"abc"} + "def"); + // temporary std::string is destroyed here, result is dangling + } + +After applying these fix-it hints, the code will use ``std::string`` instead of +``Twine`` for local variables. However, ``Twine`` has many methods that +are incompatible with ``std::string``, so the user may need to adjust the code +manually after applying the fix-it hints. diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index 48ef8be9fb782..6f099a7027a10 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -180,6 +180,8 @@ struct MissingFeatures { static bool atomicSyncScopeID() { return false; } static bool atomicTypes() { return false; } static bool atomicUseLibCall() { return false; } + static bool atomicMicrosoftVolatile() { return false; } + static bool atomicOpenMP() { return false; } // Global ctor handling static bool globalCtorLexOrder() { return false; } diff --git a/clang/lib/AST/ByteCode/Program.cpp b/clang/lib/AST/ByteCode/Program.cpp index e0b2852f0e906..2425373ab2ef8 100644 --- a/clang/lib/AST/ByteCode/Program.cpp +++ b/clang/lib/AST/ByteCode/Program.cpp @@ -218,21 +218,42 @@ UnsignedOrNone Program::createGlobal(const ValueDecl *VD, const Expr *Init) { return std::nullopt; Global *NewGlobal = Globals[*Idx]; + // Note that this loop has one iteration where Redecl == VD. for (const Decl *Redecl : VD->redecls()) { - unsigned &PIdx = GlobalIndices[Redecl]; + + // If this redecl was registered as a dummy variable, it is now a proper + // global variable and points to the block we just created. + if (auto DummyIt = DummyVariables.find(Redecl); + DummyIt != DummyVariables.end()) { + assert(!Globals[DummyIt->second]->block()->hasPointers()); + Globals[DummyIt->second] = NewGlobal; + DummyVariables.erase(DummyIt); + } + // If the redeclaration hasn't been registered yet at all, we just set its + // global index to Idx.
If it has already been registered, it might have + pointers pointing to it and we need to transfer those pointers to the new + block. + auto [Iter, Inserted] = GlobalIndices.try_emplace(Redecl); + if (Inserted) { + GlobalIndices[Redecl] = *Idx; + continue; + } + if (Redecl != VD) { - if (Block *RedeclBlock = Globals[PIdx]->block(); + if (Block *RedeclBlock = Globals[Iter->second]->block(); RedeclBlock->isExtern()) { - Globals[PIdx] = NewGlobal; + // All pointers pointing to the previous extern decl now point to the // new decl. // A previous iteration might've already fixed up the pointers for this // global. if (RedeclBlock != NewGlobal->block()) RedeclBlock->movePointersTo(NewGlobal->block()); + + Globals[Iter->second] = NewGlobal; } } - PIdx = *Idx; + Iter->second = *Idx; } return *Idx; diff --git a/clang/lib/AST/ByteCode/Program.h b/clang/lib/AST/ByteCode/Program.h index 28fcc97f5339d..cc9127dc77860 100644 --- a/clang/lib/AST/ByteCode/Program.h +++ b/clang/lib/AST/ByteCode/Program.h @@ -205,7 +205,6 @@ class Program final { const Block *block() const { return &B; } private: - /// Required metadata - does not actually track pointers. Block B; }; diff --git a/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp b/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp index 7db6e283ec0a5..cd4c1f0e5b769 100644 --- a/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp @@ -27,6 +27,7 @@ class AtomicInfo { CharUnits atomicAlign; CharUnits valueAlign; TypeEvaluationKind evaluationKind = cir::TEK_Scalar; + bool useLibCall = true; LValue lvalue; mlir::Location loc; @@ -62,8 +63,8 @@ class AtomicInfo { assert(!cir::MissingFeatures::atomicInfo()); cgf.cgm.errorNYI(loc, "AtomicInfo: non-simple lvalue"); } - - assert(!cir::MissingFeatures::atomicUseLibCall()); + useLibCall = !ctx.getTargetInfo().hasBuiltinAtomic( + atomicSizeInBits, ctx.toBits(lvalue.getAlignment())); } QualType getValueType() const { return valueTy; } @@ -75,6 +76,8 @@ class AtomicInfo { assert(!cir::MissingFeatures::atomicInfoGetAtomicPointer()); return nullptr; } + bool shouldUseLibCall() const { return useLibCall; } + const LValue &getAtomicLValue() const { return lvalue; } Address getAtomicAddress() const { mlir::Type elemTy; if (lvalue.isSimple()) { @@ -96,6 +99,8 @@ bool emitMemSetZeroIfNecessary() const; + mlir::Value getScalarRValValueOrNull(RValue rvalue) const; + /// Cast the given pointer to an integer pointer suitable for atomic /// operations on the source. Address castToAtomicIntPointer(Address addr) const; @@ -105,6 +110,9 @@ /// copy the value across. Address convertToAtomicIntPointer(Address addr) const; + /// Converts an rvalue to an integer value. + mlir::Value convertRValueToInt(RValue rvalue, bool cmpxchg = false) const; + /// Copy an atomic r-value into atomic-layout memory. void emitCopyIntoMemory(RValue rvalue) const; @@ -195,6 +203,12 @@ Address AtomicInfo::createTempAlloca() const { return tempAlloca; } +mlir::Value AtomicInfo::getScalarRValValueOrNull(RValue rvalue) const { + if (rvalue.isScalar() && (!hasPadding() || !lvalue.isSimple())) + return rvalue.getValue(); + return nullptr; +} + Address AtomicInfo::castToAtomicIntPointer(Address addr) const { auto intTy = mlir::dyn_cast(addr.getElementType()); // Don't bother with int casts if the integer size is the same.
@@ -211,10 +225,38 @@ bool AtomicInfo::emitMemSetZeroIfNecessary() const { return false; cgf.cgm.errorNYI(loc, "AtomicInfo::emitMemSetZeroIfNecessary: emit memset zero"); return false; } +/// Return true if \param valueTy is a type that should be casted to integer +/// around the atomic memory operation. If \param cmpxchg is true, then the +/// cast of a floating point type is made as that instruction cannot have +/// floating point operands. TODO: Allow compare-and-exchange and FP - see +/// comment in CIRGenAtomicExpandPass.cpp. +static bool shouldCastToInt(mlir::Type valueTy, bool cmpxchg) { + if (cir::isAnyFloatingPointType(valueTy)) + return isa(valueTy) || cmpxchg; + return !isa(valueTy) && !isa(valueTy); +} + +mlir::Value AtomicInfo::convertRValueToInt(RValue rvalue, bool cmpxchg) const { + // If we've got a scalar value of the right size, try to avoid going + // through memory. Floats get casted if needed by AtomicExpandPass. + if (mlir::Value value = getScalarRValValueOrNull(rvalue)) { + if (!shouldCastToInt(value.getType(), cmpxchg)) + return cgf.emitToMemory(value, valueTy); + + cgf.cgm.errorNYI( + loc, "AtomicInfo::convertRValueToInt: cast scalar rvalue to int"); + return nullptr; + } + + cgf.cgm.errorNYI( + loc, "AtomicInfo::convertRValueToInt: cast non-scalar rvalue to int"); + return nullptr; +} + /// Copy an r-value into memory as part of storing to an atomic type. /// This needs to create a bit-pattern suitable for atomic operations. void AtomicInfo::emitCopyIntoMemory(RValue rvalue) const { @@ -815,6 +857,79 @@ RValue CIRGenFunction::emitAtomicExpr(AtomicExpr *e) { e->getExprLoc()); } +void CIRGenFunction::emitAtomicStore(RValue rvalue, LValue dest, bool isInit) { + bool isVolatile = dest.isVolatileQualified(); + auto order = cir::MemOrder::SequentiallyConsistent; + if (!dest.getType()->isAtomicType()) { + assert(!cir::MissingFeatures::atomicMicrosoftVolatile()); + } + return emitAtomicStore(rvalue, dest, order, isVolatile, isInit); +} + +/// Emit a store to an l-value of atomic type. +/// +/// Note that the r-value is expected to be an r-value of the atomic type; this +/// means that for aggregate r-values, it should include storage for any padding +/// that was necessary. +void CIRGenFunction::emitAtomicStore(RValue rvalue, LValue dest, + cir::MemOrder order, bool isVolatile, + bool isInit) { + // If this is an aggregate r-value, it should agree in type except + // maybe for address-space qualification. + mlir::Location loc = dest.getPointer().getLoc(); + assert(!rvalue.isAggregate() || + rvalue.getAggregateAddress().getElementType() == + dest.getAddress().getElementType()); + + AtomicInfo atomics(*this, dest, loc); + LValue lvalue = atomics.getAtomicLValue(); + + if (lvalue.isSimple()) { + // If this is an initialization, just put the value there normally. + if (isInit) { + atomics.emitCopyIntoMemory(rvalue); + return; + } + + // Check whether we should use a library call. + if (atomics.shouldUseLibCall()) { + assert(!cir::MissingFeatures::atomicUseLibCall()); + cgm.errorNYI(loc, "emitAtomicStore: atomic store with library call"); + return; + } + + // Okay, we're doing this natively. + mlir::Value valueToStore = atomics.convertRValueToInt(rvalue); + + // Do the atomic store.
+ Address addr = atomics.getAtomicAddress(); + if (mlir::Value value = atomics.getScalarRValValueOrNull(rvalue)) { + if (shouldCastToInt(value.getType(), /*CmpXchg=*/false)) { + addr = atomics.castToAtomicIntPointer(addr); + valueToStore = + builder.createIntCast(valueToStore, addr.getElementType()); + } + } + cir::StoreOp store = builder.createStore(loc, valueToStore, addr); + + // Initializations don't need to be atomic. + if (!isInit) { + assert(!cir::MissingFeatures::atomicOpenMP()); + store.setMemOrder(order); + } + + // Other decoration. + if (isVolatile) + store.setIsVolatile(true); + + assert(!cir::MissingFeatures::opLoadStoreTbaa()); + return; + } + + cgm.errorNYI(loc, "emitAtomicStore: non-simple atomic lvalue"); + assert(!cir::MissingFeatures::opLoadStoreAtomic()); +} + void CIRGenFunction::emitAtomicInit(Expr *init, LValue dest) { AtomicInfo atomics(*this, dest, getLoc(init->getSourceRange())); diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index 4fb178df0e508..422fa1cf5ad2e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -311,7 +311,8 @@ static LValue emitGlobalVarDeclLValue(CIRGenFunction &cgf, const Expr *e, void CIRGenFunction::emitStoreOfScalar(mlir::Value value, Address addr, bool isVolatile, QualType ty, - bool isInit, bool isNontemporal) { + LValueBaseInfo baseInfo, bool isInit, + bool isNontemporal) { assert(!cir::MissingFeatures::opLoadStoreThreadLocal()); if (const auto *clangVecTy = ty->getAs()) { @@ -333,7 +334,13 @@ void CIRGenFunction::emitStoreOfScalar(mlir::Value value, Address addr, value = emitToMemory(value, ty); - assert(!cir::MissingFeatures::opLoadStoreAtomic()); + assert(!cir::MissingFeatures::opLoadStoreTbaa()); + LValue atomicLValue = LValue::makeAddr(addr, ty, baseInfo); + if (ty->isAtomicType() || + (!isInit && isLValueSuitableForInlineAtomic(atomicLValue))) { + emitAtomicStore(RValue::get(value), atomicLValue, isInit); + return; + } // Update the alloca with more info on initialization. 
assert(addr.getPointer() && "expected pointer to exist"); @@ -550,7 +557,8 @@ void CIRGenFunction::emitStoreOfScalar(mlir::Value value, LValue lvalue, } emitStoreOfScalar(value, lvalue.getAddress(), lvalue.isVolatile(), - lvalue.getType(), isInit, /*isNontemporal=*/false); + lvalue.getType(), lvalue.getBaseInfo(), isInit, + /*isNontemporal=*/false); } mlir::Value CIRGenFunction::emitLoadOfScalar(Address addr, bool isVolatile, diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index dece642eb13b6..1c52a78d72e33 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -1271,6 +1271,9 @@ class CIRGenFunction : public CIRGenTypeCache { RValue emitAtomicExpr(AtomicExpr *e); void emitAtomicInit(Expr *init, LValue dest); + void emitAtomicStore(RValue rvalue, LValue dest, bool isInit); + void emitAtomicStore(RValue rvalue, LValue dest, cir::MemOrder order, + bool isVolatile, bool isInit); AutoVarEmission emitAutoVarAlloca(const clang::VarDecl &d, mlir::OpBuilder::InsertPoint ip = {}); @@ -1680,8 +1683,8 @@ class CIRGenFunction : public CIRGenTypeCache { bool isInit); void emitStoreOfScalar(mlir::Value value, Address addr, bool isVolatile, - clang::QualType ty, bool isInit = false, - bool isNontemporal = false); + clang::QualType ty, LValueBaseInfo baseInfo, + bool isInit = false, bool isNontemporal = false); void emitStoreOfScalar(mlir::Value value, LValue lvalue, bool isInit); /// Store the specified rvalue into the specified diff --git a/clang/test/AST/ByteCode/records.cpp b/clang/test/AST/ByteCode/records.cpp index 83f32c97c50c7..4799ebe25dde1 100644 --- a/clang/test/AST/ByteCode/records.cpp +++ b/clang/test/AST/ByteCode/records.cpp @@ -1882,3 +1882,14 @@ namespace MethodWillHaveBody { } int n = f(0); // both-note {{instantiation of}} } + +namespace StaticRedecl { + struct T { + static T tt; + constexpr T() : p(&tt) {} + T *p; + }; + T T::tt; + constexpr T t; + static_assert(t.p == &T::tt, ""); +} diff --git a/clang/test/AST/ast-dump-arm-attr.c b/clang/test/AST/ast-dump-arm-attr.c index 78f557d4eb0b1..d26a77d067e97 100644 --- a/clang/test/AST/ast-dump-arm-attr.c +++ b/clang/test/AST/ast-dump-arm-attr.c @@ -2,7 +2,7 @@ // RUN: %clang_cc1 -triple arm-apple-darwin -ast-dump -ast-dump-filter Test %s \ // RUN: | FileCheck --strict-whitespace %s // -// RUN: %clang_cc1 -triple armv8m.base-none-eabi -mcmse -ast-dump -ast-dump-filter Test %s \ +// RUN: %clang_cc1 -triple thumbv8m.base-none-eabi -mcmse -ast-dump -ast-dump-filter Test %s \ // RUN: | FileCheck --strict-whitespace %s --check-prefix=CHECK-CMSE // // Tests with serialization: @@ -11,8 +11,8 @@ // RUN: | sed -e "s/ //" -e "s/ imported//" \ // RUN: | FileCheck --strict-whitespace %s // -// RUN: %clang_cc1 -triple armv8m.base-none-eabi -mcmse -emit-pch -o %t %s -// RUN: %clang_cc1 -x c -triple armv8m.base-none-eabi -mcmse -include-pch %t -ast-dump-all -ast-dump-filter Test /dev/null \ +// RUN: %clang_cc1 -triple thumbv8m.base-none-eabi -mcmse -emit-pch -o %t %s +// RUN: %clang_cc1 -x c -triple thumbv8m.base-none-eabi -mcmse -include-pch %t -ast-dump-all -ast-dump-filter Test /dev/null \ // RUN: | sed -e "s/ //" -e "s/ imported//" \ // RUN: | FileCheck --strict-whitespace %s diff --git a/clang/test/CIR/CodeGen/atomic.c b/clang/test/CIR/CodeGen/atomic.c index 65799881a0cbe..d5bea8446d730 100644 --- a/clang/test/CIR/CodeGen/atomic.c +++ b/clang/test/CIR/CodeGen/atomic.c @@ -46,6 +46,32 @@ void f2(void) { // OGCG-NEXT: store i32 42, ptr %[[SLOT]], align 4 // OGCG: } 
+void f3(_Atomic(int) *p) { + *p = 42; +} + +// CIR-LABEL: @f3 +// CIR: cir.store align(4) atomic(seq_cst) %{{.+}}, %{{.+}} : !s32i, !cir.ptr + +// LLVM-LABEL: @f3 +// LLVM: store atomic i32 42, ptr %{{.+}} seq_cst, align 4 + +// OGCG-LABEL: @f3 +// OGCG: store atomic i32 42, ptr %{{.+}} seq_cst, align 4 + +void f4(_Atomic(float) *p) { + *p = 3.14; +} + +// CIR-LABEL: @f4 +// CIR: cir.store align(4) atomic(seq_cst) %{{.+}}, %{{.+}} : !cir.float, !cir.ptr + +// LLVM-LABEL: @f4 +// LLVM: store atomic float 0x40091EB860000000, ptr %{{.+}} seq_cst, align 4 + +// OGCG-LABEL: @f4 +// OGCG: store atomic float 0x40091EB860000000, ptr %{{.+}} seq_cst, align 4 + void load(int *ptr) { int x; __atomic_load(ptr, &x, __ATOMIC_RELAXED); diff --git a/clang/test/CodeGen/arm-acle-coproc.c b/clang/test/CodeGen/arm-acle-coproc.c index 5acb9f65413a0..000fff632f0b7 100644 --- a/clang/test/CodeGen/arm-acle-coproc.c +++ b/clang/test/CodeGen/arm-acle-coproc.c @@ -4,10 +4,10 @@ // RUN: %clang_cc1 -triple armv5te %s -E -dD -o - | FileCheck --check-prefix=CHECK-V5-TE %s // RUN: %clang_cc1 -triple armv5tej %s -E -dD -o - | FileCheck --check-prefix=CHECK-V5-TE %s // RUN: %clang_cc1 -triple armv6 %s -E -dD -o - | FileCheck --check-prefix=CHECK-V6 %s -// RUN: %clang_cc1 -triple armv6m %s -E -dD -o - | FileCheck --check-prefix=CHECK-V6M %s +// RUN: %clang_cc1 -triple thumbv6m %s -E -dD -o - | FileCheck --check-prefix=CHECK-V6M %s // RUN: %clang_cc1 -triple armv7a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V7 %s // RUN: %clang_cc1 -triple armv7r %s -E -dD -o - | FileCheck --check-prefix=CHECK-V7 %s -// RUN: %clang_cc1 -triple armv7m %s -E -dD -o - | FileCheck --check-prefix=CHECK-V7 %s +// RUN: %clang_cc1 -triple thumbv7m %s -E -dD -o - | FileCheck --check-prefix=CHECK-V7 %s // RUN: %clang_cc1 -triple armv8a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s // RUN: %clang_cc1 -triple armv8r %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s // RUN: %clang_cc1 -triple armv8.1a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s diff --git a/clang/test/CodeGen/pr45476.cpp b/clang/test/CodeGen/pr45476.cpp index c95f7fb8cd9c3..3a67904a8e568 100644 --- a/clang/test/CodeGen/pr45476.cpp +++ b/clang/test/CodeGen/pr45476.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple armv6m-eabi -emit-llvm %s -o - | FileCheck -check-prefix=LIBCALL %s +// RUN: %clang_cc1 -triple thumbv6m-eabi -emit-llvm %s -o - | FileCheck -check-prefix=LIBCALL %s // RUN: %clang_cc1 -triple armv8-eabi -emit-llvm %s -o - | FileCheck -check-prefix=NATIVE %s // PR45476 diff --git a/clang/test/Sema/builtins-arm-exclusive-124.c b/clang/test/Sema/builtins-arm-exclusive-124.c index b35ac181f0887..93540879a01ba 100644 --- a/clang/test/Sema/builtins-arm-exclusive-124.c +++ b/clang/test/Sema/builtins-arm-exclusive-124.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple armv7m -fsyntax-only -verify %s -// RUN: %clang_cc1 -triple armv8m.main -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple thumbv7m -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple thumbv8m.main -fsyntax-only -verify %s // RUN: %clang_cc1 -triple armv8.1m.main -fsyntax-only -verify %s // All these architecture versions provide 1-, 2- or 4-byte exclusive accesses, diff --git a/clang/test/Sema/builtins-arm-exclusive-none.c b/clang/test/Sema/builtins-arm-exclusive-none.c index 2ef910dd99aaf..25a71e18935a6 100644 --- a/clang/test/Sema/builtins-arm-exclusive-none.c +++ b/clang/test/Sema/builtins-arm-exclusive-none.c @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple armv6m -fsyntax-only -verify %s +// RUN: 
%clang_cc1 -triple thumbv6m -fsyntax-only -verify %s // Armv6-M does not support exclusive loads/stores at all, so all uses of // __builtin_arm_ldrex[d] and __builtin_arm_strex[d] is forbidden. diff --git a/clang/test/SemaCXX/dllexport.cpp b/clang/test/SemaCXX/dllexport.cpp index f503e2fc311d1..169af5cacc6c7 100644 --- a/clang/test/SemaCXX/dllexport.cpp +++ b/clang/test/SemaCXX/dllexport.cpp @@ -1,13 +1,13 @@ -// RUN: %clang_cc1 -triple i686-win32 -fsyntax-only -fms-extensions -verify -std=c++11 -Wunsupported-dll-base-class-template -DMS %s -// RUN: %clang_cc1 -triple x86_64-win32 -fsyntax-only -fms-extensions -verify -std=c++1y -Wunsupported-dll-base-class-template -DMS %s -// RUN: %clang_cc1 -triple i686-mingw32 -fsyntax-only -fms-extensions -verify -std=c++1y -Wunsupported-dll-base-class-template -DGNU %s -// RUN: %clang_cc1 -triple x86_64-mingw32 -fsyntax-only -fms-extensions -verify -std=c++11 -Wunsupported-dll-base-class-template -DGNU %s -// RUN: %clang_cc1 -triple i686-pc-cygwin -fsyntax-only -fms-extensions -verify -std=c++1y -Wunsupported-dll-base-class-template -DGNU %s -// RUN: %clang_cc1 -triple x86_64-pc-cygwin -fsyntax-only -fms-extensions -verify -std=c++11 -Wunsupported-dll-base-class-template -DGNU %s -// RUN: %clang_cc1 -triple i686-windows-itanium -fsyntax-only -fms-extensions -verify -std=c++11 -Wunsupported-dll-base-class-template -DWI %s -// RUN: %clang_cc1 -triple x86_64-windows-itanium -fsyntax-only -fms-extensions -verify -std=c++1y -Wunsupported-dll-base-class-template -DWI %s -// RUN: %clang_cc1 -triple x86_64-scei-ps4 -fsyntax-only -fdeclspec -verify -std=c++11 -Wunsupported-dll-base-class-template -DPS %s -// RUN: %clang_cc1 -triple x86_64-sie-ps5 -fsyntax-only -fdeclspec -verify -std=c++1y -Wunsupported-dll-base-class-template -DPS %s +// RUN: %clang_cc1 -triple i686-win32 -fsyntax-only -fms-extensions -verify=expected,ms,non-gnu,ms-ps -std=c++11 -Wunsupported-dll-base-class-template %s +// RUN: %clang_cc1 -triple x86_64-win32 -fsyntax-only -fms-extensions -verify=expected,ms,non-gnu,ms-ps -std=c++1y -Wunsupported-dll-base-class-template %s +// RUN: %clang_cc1 -triple i686-mingw32 -fsyntax-only -fms-extensions -verify=expected,non-ms,gnu,win-gnu -std=c++1y -Wunsupported-dll-base-class-template %s +// RUN: %clang_cc1 -triple x86_64-mingw32 -fsyntax-only -fms-extensions -verify=expected,non-ms,gnu,win-gnu -std=c++11 -Wunsupported-dll-base-class-template %s +// RUN: %clang_cc1 -triple i686-pc-cygwin -fsyntax-only -fms-extensions -verify=expected,non-ms,gnu,win-gnu -std=c++1y -Wunsupported-dll-base-class-template %s +// RUN: %clang_cc1 -triple x86_64-pc-cygwin -fsyntax-only -fms-extensions -verify=expected,non-ms,gnu,win-gnu -std=c++11 -Wunsupported-dll-base-class-template %s +// RUN: %clang_cc1 -triple i686-windows-itanium -fsyntax-only -fms-extensions -verify=expected,non-ms,non-gnu,win-gnu -std=c++11 -Wunsupported-dll-base-class-template %s +// RUN: %clang_cc1 -triple x86_64-windows-itanium -fsyntax-only -fms-extensions -verify=expected,non-ms,non-gnu,win-gnu -std=c++1y -Wunsupported-dll-base-class-template %s +// RUN: %clang_cc1 -triple x86_64-scei-ps4 -fsyntax-only -fdeclspec -verify=expected,non-ms,non-gnu,ms-ps -std=c++11 -Wunsupported-dll-base-class-template %s +// RUN: %clang_cc1 -triple x86_64-sie-ps5 -fsyntax-only -fdeclspec -verify=expected,non-ms,non-gnu,ms-ps -std=c++1y -Wunsupported-dll-base-class-template %s // Helper structs to make templates more expressive. 
struct ImplicitInst_Exported {}; @@ -75,9 +75,7 @@ __declspec(dllexport) extern int GlobalRedecl4; // expected-warning{{redeclarati // External linkage is required. __declspec(dllexport) static int StaticGlobal; // expected-error{{'StaticGlobal' must have external linkage when declared 'dllexport'}} __declspec(dllexport) Internal InternalTypeGlobal; // expected-error{{'InternalTypeGlobal' must have external linkage when declared 'dllexport'}} -#ifndef MS -namespace { __declspec(dllexport) int InternalGlobal; } // expected-error{{'(anonymous namespace)::InternalGlobal' must have external linkage when declared 'dllexport'}} -#endif +namespace { __declspec(dllexport) int InternalGlobal; } // non-ms-error{{'(anonymous namespace)::InternalGlobal' must have external linkage when declared 'dllexport'}} namespace ns { __declspec(dllexport) int ExternalGlobal; } __declspec(dllexport) auto InternalAutoTypeGlobal = Internal(); // expected-error{{'InternalAutoTypeGlobal' must have external linkage when declared 'dllexport'}} @@ -132,9 +130,7 @@ template __declspec(dllexport) extern int VarTmplRedecl3; // expecte // External linkage is required. template __declspec(dllexport) static int StaticVarTmpl; // expected-error{{'StaticVarTmpl' must have external linkage when declared 'dllexport'}} template __declspec(dllexport) Internal InternalTypeVarTmpl; // expected-error{{'InternalTypeVarTmpl' must have external linkage when declared 'dllexport'}} -#ifndef MS -namespace { template __declspec(dllexport) int InternalVarTmpl; } // expected-error{{'(anonymous namespace)::InternalVarTmpl' must have external linkage when declared 'dllexport'}} -#endif +namespace { template __declspec(dllexport) int InternalVarTmpl; } // non-ms-error{{'(anonymous namespace)::InternalVarTmpl' must have external linkage when declared 'dllexport'}} namespace ns { template __declspec(dllexport) int ExternalVarTmpl = 1; } template __declspec(dllexport) auto InternalAutoTypeVarTmpl = Internal(); // expected-error{{'InternalAutoTypeVarTmpl' must have external linkage when declared 'dllexport'}} @@ -355,11 +351,8 @@ class __declspec(dllexport) ClassDecl; class __declspec(dllexport) ClassDef {}; -#if defined(MS) || defined (WI) || defined(PS) -// expected-warning@+3{{'dllexport' attribute ignored}} -#endif template struct PartiallySpecializedClassTemplate {}; -template struct __declspec(dllexport) PartiallySpecializedClassTemplate { void f() {} }; +template struct __declspec(dllexport) PartiallySpecializedClassTemplate { void f() {} }; // non-gnu-warning {{'dllexport' attribute ignored}} template struct ExpliciallySpecializedClassTemplate {}; template <> struct __declspec(dllexport) ExpliciallySpecializedClassTemplate { void f() {} }; @@ -373,16 +366,11 @@ ImplicitlyInstantiatedExportedTemplate implicitlyInstantiatedExp // Don't instantiate class members of templates with explicit instantiation declarations, even if they are exported. 
struct IncompleteType2; -#if defined(MS) || defined (WI) || defined(PS) -// expected-note@+2{{attribute is here}} -#endif -template struct __declspec(dllexport) ExportedTemplateWithExplicitInstantiationDecl { + +template struct __declspec(dllexport) ExportedTemplateWithExplicitInstantiationDecl { // non-gnu-note {{attribute is here}} int f() { return sizeof(T); } // no-error }; -#if defined(MS) || defined (WI) || defined(PS) -// expected-warning@+2{{explicit instantiation declaration should not be 'dllexport'}} -#endif -extern template struct ExportedTemplateWithExplicitInstantiationDecl; +extern template struct ExportedTemplateWithExplicitInstantiationDecl; // non-gnu-warning {{explicit instantiation declaration should not be 'dllexport'}} // Instantiate class members for explicitly instantiated exported templates. struct IncompleteType3; // expected-note{{forward declaration of 'IncompleteType3'}} @@ -392,16 +380,9 @@ template struct __declspec(dllexport) ExplicitlyInstantiatedExporte template struct ExplicitlyInstantiatedExportedTemplate; // expected-note{{in instantiation of member function 'ExplicitlyInstantiatedExportedTemplate::f' requested here}} // In MS mode, instantiate members of class templates that are base classes of exported classes. -#if defined(MS) || defined(PS) - // expected-note@+3{{forward declaration of 'IncompleteType4'}} - // expected-note@+3{{in instantiation of member function 'BaseClassTemplateOfExportedClass::f' requested here}} -#endif -struct IncompleteType4; -template struct BaseClassTemplateOfExportedClass { -#if defined(MS) || defined(PS) - // expected-error@+2{{invalid application of 'sizeof' to an incomplete type 'IncompleteType4'}} -#endif - int f() { return sizeof(T); }; +struct IncompleteType4; // ms-ps-note {{forward declaration of 'IncompleteType4'}} +template struct BaseClassTemplateOfExportedClass { // ms-ps-note {{in instantiation of member function 'BaseClassTemplateOfExportedClass::f' requested here}} + int f() { return sizeof(T); }; // ms-ps-error {{invalid application of 'sizeof' to an incomplete type 'IncompleteType4'}} }; struct __declspec(dllexport) ExportedBaseClass : public BaseClassTemplateOfExportedClass {}; @@ -414,17 +395,11 @@ struct __declspec(dllexport) ExportedBaseClass2 : public ExportedBaseClassTempla // Warn about explicit instantiation declarations of dllexport classes. 
template struct ExplicitInstantiationDeclTemplate {}; -#if defined(MS) || defined (WI) || defined(PS) -// expected-warning@+2{{explicit instantiation declaration should not be 'dllexport'}} expected-note@+2{{attribute is here}} -#endif -extern template struct __declspec(dllexport) ExplicitInstantiationDeclTemplate; +extern template struct __declspec(dllexport) ExplicitInstantiationDeclTemplate; // non-gnu-warning {{explicit instantiation declaration should not be 'dllexport'}} \ + non-gnu-note {{attribute is here}} -template struct __declspec(dllexport) ExplicitInstantiationDeclExportedTemplate {}; -#if defined(MS) || defined (WI) || defined(PS) -// expected-note@-2{{attribute is here}} -// expected-warning@+2{{explicit instantiation declaration should not be 'dllexport'}} -#endif -extern template struct ExplicitInstantiationDeclExportedTemplate; +template struct __declspec(dllexport) ExplicitInstantiationDeclExportedTemplate {}; // non-gnu-note {{attribute is here}} +extern template struct ExplicitInstantiationDeclExportedTemplate; // non-gnu-warning {{explicit instantiation declaration should not be 'dllexport'}} namespace { struct InternalLinkageType {}; } struct __declspec(dllexport) PR23308 { @@ -440,35 +415,23 @@ class __declspec(dllexport) ExportedClass {}; class __declspec(dllimport) ImportedClass {}; template class ClassTemplate {}; -#if not defined(MS) && not defined(PS) -// expected-error@+2{{'ExportedClassTemplate' must have external linkage when declared 'dllexport'}} -#endif -template class __declspec(dllexport) ExportedClassTemplate {}; +template class __declspec(dllexport) ExportedClassTemplate {}; // win-gnu-error {{'ExportedClassTemplate' must have external linkage when declared 'dllexport'}} template class __declspec(dllimport) ImportedClassTemplate {}; template struct ExplicitlySpecializedTemplate { void func() {} }; -#if defined(MS) || defined(PS) -// expected-note@+2{{class template 'ExplicitlySpecializedTemplate' was explicitly specialized here}} -#endif -template <> struct ExplicitlySpecializedTemplate { void func() {} }; +template <> struct ExplicitlySpecializedTemplate { void func() {} }; // ms-ps-note {{class template 'ExplicitlySpecializedTemplate' was explicitly specialized here}} template struct ExplicitlyExportSpecializedTemplate { void func() {} }; template <> struct __declspec(dllexport) ExplicitlyExportSpecializedTemplate { void func() {} }; template struct ExplicitlyImportSpecializedTemplate { void func() {} }; template <> struct __declspec(dllimport) ExplicitlyImportSpecializedTemplate { void func() {} }; template struct ExplicitlyInstantiatedTemplate { void func() {} }; -#if defined(MS) || defined(PS) -// expected-note@+2{{class template 'ExplicitlyInstantiatedTemplate' was instantiated here}} -#endif -template struct ExplicitlyInstantiatedTemplate; +template struct ExplicitlyInstantiatedTemplate; // ms-ps-note {{class template 'ExplicitlyInstantiatedTemplate' was instantiated here}} template struct ExplicitlyExportInstantiatedTemplate { void func() {} }; template struct __declspec(dllexport) ExplicitlyExportInstantiatedTemplate; template struct ExplicitlyExportDeclaredInstantiatedTemplate { void func() {} }; extern template struct ExplicitlyExportDeclaredInstantiatedTemplate; -#if not defined(MS) && not defined (WI) && not defined(PS) -// expected-warning@+2{{'dllexport' attribute ignored on explicit instantiation definition}} -#endif -template struct __declspec(dllexport) ExplicitlyExportDeclaredInstantiatedTemplate; +template struct 
__declspec(dllexport) ExplicitlyExportDeclaredInstantiatedTemplate; // gnu-warning {{'dllexport' attribute ignored on explicit instantiation definition}} template struct ExplicitlyImportInstantiatedTemplate { void func() {} }; template struct __declspec(dllimport) ExplicitlyImportInstantiatedTemplate; @@ -496,11 +459,8 @@ class __declspec(dllexport) DerivedFromTemplateB : public ClassTemplate {} // The second derived class doesn't change anything, the attribute that was propagated first wins. class __declspec(dllimport) DerivedFromTemplateB2 : public ClassTemplate {}; -#if defined(MS) || defined(PS) -// expected-warning@+3{{propagating dll attribute to explicitly specialized base class template without dll attribute is not supported}} -// expected-note@+2{{attribute is here}} -#endif -struct __declspec(dllexport) DerivedFromExplicitlySpecializedTemplate : public ExplicitlySpecializedTemplate {}; +struct __declspec(dllexport) DerivedFromExplicitlySpecializedTemplate : public ExplicitlySpecializedTemplate {}; // ms-ps-warning {{propagating dll attribute to explicitly specialized base class template without dll attribute is not supported}} \ + ms-ps-note {{attribute is here}} // Base class alredy specialized with export attribute. struct __declspec(dllexport) DerivedFromExplicitlyExportSpecializedTemplate : public ExplicitlyExportSpecializedTemplate {}; @@ -508,11 +468,8 @@ struct __declspec(dllexport) DerivedFromExplicitlyExportSpecializedTemplate : pu // Base class already specialized with import attribute. struct __declspec(dllexport) DerivedFromExplicitlyImportSpecializedTemplate : public ExplicitlyImportSpecializedTemplate {}; -#if defined(MS) || defined(PS) -// expected-warning@+3{{propagating dll attribute to already instantiated base class template without dll attribute is not supported}} -// expected-note@+2{{attribute is here}} -#endif -struct __declspec(dllexport) DerivedFromExplicitlyInstantiatedTemplate : public ExplicitlyInstantiatedTemplate {}; +struct __declspec(dllexport) DerivedFromExplicitlyInstantiatedTemplate : public ExplicitlyInstantiatedTemplate {}; // ms-ps-warning {{propagating dll attribute to already instantiated base class template without dll attribute is not supported}} \ + ms-ps-note {{attribute is here}} // Base class already instantiated with export attribute. struct __declspec(dllexport) DerivedFromExplicitlyExportInstantiatedTemplate : public ExplicitlyExportInstantiatedTemplate {}; @@ -528,10 +485,7 @@ void func() { // MSVC allows deriving from exported template classes in local contexts. 
class LocalDerivedFromExportedClass : public ExportedClass {}; class LocalDerivedFromExportedTemplate : public ExportedClassTemplate {}; -#if not defined(MS) && not defined (PS) - // expected-note@+2{{in instantiation of template class 'ExportedClassTemplate' requested here}} -#endif - class LocalCRTP : public ExportedClassTemplate {}; + class LocalCRTP : public ExportedClassTemplate {}; // win-gnu-note {{in instantiation of template class 'ExportedClassTemplate' requested here}} } //===----------------------------------------------------------------------===// @@ -778,46 +732,40 @@ __declspec(dllexport) void MemberRedecl::staticInlineDecl() {} // expect __declspec(dllexport) int MemberRedecl::StaticField = 1; // expected-error{{redeclaration of 'MemberRedecl::StaticField' cannot add 'dllexport' attribute}} __declspec(dllexport) const int MemberRedecl::StaticConstField = 1; // expected-error{{redeclaration of 'MemberRedecl::StaticConstField' cannot add 'dllexport' attribute}} -#ifdef MS -// expected-warning@+4{{attribute declaration must precede definition}} -#else -// expected-error@+2{{redeclaration of 'MemberRedecl::ConstexprField' cannot add 'dllexport' attribute}} -#endif -__declspec(dllexport) constexpr int MemberRedecl::ConstexprField; -#ifdef MS +__declspec(dllexport) constexpr int MemberRedecl::ConstexprField; // ms-warning {{attribute declaration must precede definition}} \ + non-ms-error {{redeclaration of 'MemberRedecl::ConstexprField' cannot add 'dllexport' attribute}} + struct __declspec(dllexport) ClassWithMultipleDefaultCtors { - ClassWithMultipleDefaultCtors(int = 40) {} // expected-error{{'__declspec(dllexport)' cannot be applied to more than one default constructor}} - ClassWithMultipleDefaultCtors(int = 30, ...) {} // expected-note{{declared here}} + ClassWithMultipleDefaultCtors(int = 40) {} // ms-error{{'__declspec(dllexport)' cannot be applied to more than one default constructor}} + ClassWithMultipleDefaultCtors(int = 30, ...) {} // ms-note{{declared here}} }; template struct ClassTemplateWithMultipleDefaultCtors { - __declspec(dllexport) ClassTemplateWithMultipleDefaultCtors(int = 40) {} // expected-error{{'__declspec(dllexport)' cannot be applied to more than one default constructor}} - __declspec(dllexport) ClassTemplateWithMultipleDefaultCtors(int = 30, ...) {} // expected-note{{declared here}} + __declspec(dllexport) ClassTemplateWithMultipleDefaultCtors(int = 40) {} // ms-error{{'__declspec(dllexport)' cannot be applied to more than one default constructor}} + __declspec(dllexport) ClassTemplateWithMultipleDefaultCtors(int = 30, ...) 
{} // ms-note{{declared here}} }; template struct HasDefaults { - HasDefaults(int x = sizeof(T)) {} // expected-error {{invalid application of 'sizeof'}} + HasDefaults(int x = sizeof(T)) {} // ms-error {{invalid application of 'sizeof'}} }; template struct __declspec(dllexport) HasDefaults; template struct -__declspec(dllexport) // expected-note {{in instantiation of default function argument expression for 'HasDefaults' required here}} -HasDefaults; // expected-note {{in instantiation of member function 'HasDefaults::HasDefaults' requested here}} +__declspec(dllexport) // ms-note {{in instantiation of default function argument expression for 'HasDefaults' required here}} +HasDefaults; // ms-note {{in instantiation of member function 'HasDefaults::HasDefaults' requested here}} template struct HasDefaults2 { - __declspec(dllexport) // expected-note {{in instantiation of default function argument expression for 'HasDefaults2' required here}} - HasDefaults2(int x = sizeof(T)) {} // expected-error {{invalid application of 'sizeof'}} + __declspec(dllexport) // ms-note {{in instantiation of default function argument expression for 'HasDefaults2' required here}} + HasDefaults2(int x = sizeof(T)) {} // ms-error {{invalid application of 'sizeof'}} }; -template struct HasDefaults2; // expected-note {{in instantiation of member function 'HasDefaults2::HasDefaults2' requested here}} +template struct HasDefaults2; // ms-note {{in instantiation of member function 'HasDefaults2::HasDefaults2' requested here}} -template struct __declspec(dllexport) HasDefaults3 { // expected-note{{in instantiation of default function argument expression for 'HasDefaults3' required here}} - HasDefaults3(int x = sizeof(T)) {} // expected-error {{invalid application of 'sizeof'}} +template struct __declspec(dllexport) HasDefaults3 { // ms-note{{in instantiation of default function argument expression for 'HasDefaults3' required here}} + HasDefaults3(int x = sizeof(T)) {} // ms-error {{invalid application of 'sizeof'}} }; template <> HasDefaults3::HasDefaults3(int) {}; -#endif - //===----------------------------------------------------------------------===// // Class member templates //===----------------------------------------------------------------------===// @@ -887,12 +835,8 @@ template __declspec(dllexport) void MemTmplRedecl::staticInli template __declspec(dllexport) int MemTmplRedecl::StaticField = 1; // expected-error{{redeclaration of 'MemTmplRedecl::StaticField' cannot add 'dllexport' attribute}} template __declspec(dllexport) const int MemTmplRedecl::StaticConstField = 1; // expected-error{{redeclaration of 'MemTmplRedecl::StaticConstField' cannot add 'dllexport' attribute}} -#ifdef MS -// expected-warning@+4{{attribute declaration must precede definition}} -#else -// expected-error@+2{{redeclaration of 'MemTmplRedecl::ConstexprField' cannot add 'dllexport' attribute}} -#endif -template __declspec(dllexport) constexpr int MemTmplRedecl::ConstexprField; +template __declspec(dllexport) constexpr int MemTmplRedecl::ConstexprField; // ms-warning {{attribute declaration must precede definition}} \ + non-ms-error {{redeclaration of 'MemTmplRedecl::ConstexprField' cannot add 'dllexport' attribute}} #endif // __has_feature(cxx_variable_templates) @@ -1097,20 +1041,13 @@ template __declspec(dllexport) void CTMR::staticInlineDecl template __declspec(dllexport) int CTMR::StaticField = 1; // expected-error{{redeclaration of 'CTMR::StaticField' cannot add 'dllexport' attribute}} template __declspec(dllexport) const int 
CTMR::StaticConstField = 1; // expected-error{{redeclaration of 'CTMR::StaticConstField' cannot add 'dllexport' attribute}} -#ifdef MS -// expected-warning@+4{{attribute declaration must precede definition}} -#else -// expected-error@+2{{redeclaration of 'CTMR::ConstexprField' cannot add 'dllexport' attribute}} -#endif -template __declspec(dllexport) constexpr int CTMR::ConstexprField; +template __declspec(dllexport) constexpr int CTMR::ConstexprField; // ms-warning {{attribute declaration must precede definition}} \ + non-ms-error {{redeclaration of 'CTMR::ConstexprField' cannot add 'dllexport' attribute}} // MSVC exports explicit specialization of exported class template member // function, and errors on such definitions. MinGW does not treat them as // dllexport. -#if !defined(GNU) -// expected-error@+2{{attribute 'dllexport' cannot be applied to a deleted function}} -#endif -template <> void ExportClassTmplMembers::normalDecl() = delete; +template <> void ExportClassTmplMembers::normalDecl() = delete; // non-gnu-error {{attribute 'dllexport' cannot be applied to a deleted function}} //===----------------------------------------------------------------------===// @@ -1183,12 +1120,8 @@ template template __declspec(dllexport) void CTMT #if __has_feature(cxx_variable_templates) template template __declspec(dllexport) int CTMTR::StaticField = 1; // expected-error{{redeclaration of 'CTMTR::StaticField' cannot add 'dllexport' attribute}} template template __declspec(dllexport) const int CTMTR::StaticConstField = 1; // expected-error{{redeclaration of 'CTMTR::StaticConstField' cannot add 'dllexport' attribute}} -#ifdef MS -// expected-warning@+4{{attribute declaration must precede definition}} -#else -// expected-error@+2{{redeclaration of 'CTMTR::ConstexprField' cannot add 'dllexport' attribute}} -#endif -template template __declspec(dllexport) constexpr int CTMTR::ConstexprField; +template template __declspec(dllexport) constexpr int CTMTR::ConstexprField; // ms-warning {{attribute declaration must precede definition}} \ + non-ms-error {{redeclaration of 'CTMTR::ConstexprField' cannot add 'dllexport' attribute}} #endif // __has_feature(cxx_variable_templates) // FIXME: Precedence rules seem to be different for classes. @@ -1197,7 +1130,4 @@ template template __declspec(dllexport) constexpr int CT // Lambdas //===----------------------------------------------------------------------===// // The MS ABI doesn't provide a stable mangling for lambdas, so they can't be imported or exported. -#if defined(MS) || defined (WI) || defined(PS) -// expected-error@+2{{lambda cannot be declared 'dllexport'}} -#endif -auto Lambda = []() __declspec(dllexport) -> bool { return true; }; +auto Lambda = []() __declspec(dllexport) -> bool { return true; }; // non-gnu-error {{lambda cannot be declared 'dllexport'}} diff --git a/libc/shared/math.h b/libc/shared/math.h index bd6aee73c3933..282dd6243d6a7 100644 --- a/libc/shared/math.h +++ b/libc/shared/math.h @@ -51,6 +51,7 @@ #include "math/exp2f.h" #include "math/exp2f16.h" #include "math/exp2m1f.h" +#include "math/exp2m1f16.h" #include "math/expf.h" #include "math/expf16.h" #include "math/frexpf.h" diff --git a/libc/shared/math/exp2m1f16.h b/libc/shared/math/exp2m1f16.h new file mode 100644 index 0000000000000..96a404708be18 --- /dev/null +++ b/libc/shared/math/exp2m1f16.h @@ -0,0 +1,29 @@ +//===-- Shared exp2m1f16 function -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_MATH_EXP2M1F16_H +#define LLVM_LIBC_SHARED_MATH_EXP2M1F16_H + +#include "include/llvm-libc-macros/float16-macros.h" +#include "shared/libc_common.h" + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +#include "src/__support/math/exp2m1f16.h" + +namespace LIBC_NAMESPACE_DECL { +namespace shared { + +using math::exp2m1f16; + +} // namespace shared +} // namespace LIBC_NAMESPACE_DECL + +#endif // LIBC_TYPES_HAS_FLOAT16 + +#endif // LLVM_LIBC_SHARED_MATH_EXP2M1F16_H diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt index 620900028d424..ddc0159b10ce4 100644 --- a/libc/src/__support/math/CMakeLists.txt +++ b/libc/src/__support/math/CMakeLists.txt @@ -769,6 +769,24 @@ add_header_library( libc.src.__support.macros.properties.cpu_features ) +add_header_library( + exp2m1f16 + HDRS + exp2m1f16.h + DEPENDS + .expxf16_utils + libc.src.__support.common + libc.src.__support.FPUtil.cast + libc.src.__support.FPUtil.except_value_utils + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.rounding_mode + libc.src.__support.macros.optimization + libc.src.__support.macros.properties.cpu_features +) + add_header_library( exp10 HDRS diff --git a/libc/src/__support/math/exp2m1f16.h b/libc/src/__support/math/exp2m1f16.h new file mode 100644 index 0000000000000..0424af4aa953d --- /dev/null +++ b/libc/src/__support/math/exp2m1f16.h @@ -0,0 +1,180 @@ +//===-- Implementation header for exp2m1f16 ----------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_EXP2M1F16_H +#define LLVM_LIBC_SRC___SUPPORT_MATH_EXP2M1F16_H + +#include "include/llvm-libc-macros/float16-macros.h" + +#ifdef LIBC_TYPES_HAS_FLOAT16 + +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/cast.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" +#include "src/__support/macros/properties/cpu_features.h" +#include "src/__support/math/expxf16_utils.h" + +namespace LIBC_NAMESPACE_DECL { + +namespace math { + +LIBC_INLINE static constexpr float16 exp2m1f16(float16 x) { +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + constexpr fputil::ExceptValues EXP2M1F16_EXCEPTS_LO = {{ + // (input, RZ output, RU offset, RD offset, RN offset) + // x = 0x1.cf4p-13, exp2m1f16(x) = 0x1.41p-13 (RZ) + {0x0b3dU, 0x0904U, 1U, 0U, 1U}, + // x = 0x1.4fcp-12, exp2m1f16(x) = 0x1.d14p-13 (RZ) + {0x0d3fU, 0x0b45U, 1U, 0U, 1U}, + // x = 0x1.63p-11, exp2m1f16(x) = 0x1.ec4p-12 (RZ) + {0x118cU, 0x0fb1U, 1U, 0U, 0U}, + // x = 0x1.6fp-7, exp2m1f16(x) = 0x1.fe8p-8 (RZ) + {0x21bcU, 0x1ffaU, 1U, 0U, 1U}, + // x = -0x1.c6p-10, exp2m1f16(x) = -0x1.3a8p-10 (RZ) + {0x9718U, 0x94eaU, 0U, 1U, 0U}, + // x = -0x1.cfcp-10, exp2m1f16(x) = -0x1.414p-10 (RZ) + {0x973fU, 0x9505U, 0U, 1U, 0U}, + }}; + +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT + constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 6; +#else + constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 7; +#endif + + constexpr fputil::ExceptValues + EXP2M1F16_EXCEPTS_HI = {{ + // (input, RZ output, RU offset, RD offset, RN offset) + // x = 0x1.e58p-3, exp2m1f16(x) = 0x1.6dcp-3 (RZ) + {0x3396U, 0x31b7U, 1U, 0U, 0U}, +#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT + // x = 0x1.2e8p-2, exp2m1f16(x) = 0x1.d14p-3 (RZ) + {0x34baU, 0x3345U, 1U, 0U, 0U}, +#endif + // x = 0x1.ad8p-2, exp2m1f16(x) = 0x1.598p-2 (RZ) + {0x36b6U, 0x3566U, 1U, 0U, 0U}, +#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT + // x = 0x1.edcp-2, exp2m1f16(x) = 0x1.964p-2 (RZ) + {0x37b7U, 0x3659U, 1U, 0U, 1U}, +#endif + // x = -0x1.804p-3, exp2m1f16(x) = -0x1.f34p-4 (RZ) + {0xb201U, 0xafcdU, 0U, 1U, 1U}, + // x = -0x1.f3p-3, exp2m1f16(x) = -0x1.3e4p-3 (RZ) + {0xb3ccU, 0xb0f9U, 0U, 1U, 0U}, + // x = -0x1.294p-1, exp2m1f16(x) = -0x1.53p-2 (RZ) + {0xb8a5U, 0xb54cU, 0U, 1U, 1U}, +#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT + // x = -0x1.a34p-1, exp2m1f16(x) = -0x1.bb4p-2 (RZ) + {0xba8dU, 0xb6edU, 0U, 1U, 1U}, +#endif + }}; +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + + using namespace math::expxf16_internal; + using FPBits = fputil::FPBits; + FPBits x_bits(x); + + uint16_t x_u = x_bits.uintval(); + uint16_t x_abs = x_u & 0x7fffU; + + // When |x| <= 2^(-3), or |x| >= 11, or x is NaN. + if (LIBC_UNLIKELY(x_abs <= 0x3000U || x_abs >= 0x4980U)) { + // exp2m1(NaN) = NaN + if (x_bits.is_nan()) { + if (x_bits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + + return x; + } + + // When x >= 16. 
+ if (x_u >= 0x4c00 && x_bits.is_pos()) { + // exp2m1(+inf) = +inf + if (x_bits.is_inf()) + return FPBits::inf().get_val(); + + switch (fputil::quick_get_round()) { + case FE_TONEAREST: + case FE_UPWARD: + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_OVERFLOW | FE_INEXACT); + return FPBits::inf().get_val(); + default: + return FPBits::max_normal().get_val(); + } + } + + // When x < -11. + if (x_u > 0xc980U) { + // exp2m1(-inf) = -1 + if (x_bits.is_inf()) + return FPBits::one(Sign::NEG).get_val(); + + // When -12 < x < -11, round(2^x - 1, HP, RN) = -0x1.ffcp-1. + if (x_u < 0xca00U) + return fputil::round_result_slightly_down( + fputil::cast(-0x1.ffcp-1)); + + // When x <= -12, round(2^x - 1, HP, RN) = -1. + switch (fputil::quick_get_round()) { + case FE_TONEAREST: + case FE_DOWNWARD: + return FPBits::one(Sign::NEG).get_val(); + default: + return fputil::cast(-0x1.ffcp-1); + } + } + + // When |x| <= 2^(-3). + if (x_abs <= 0x3000U) { +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + if (auto r = EXP2M1F16_EXCEPTS_LO.lookup(x_u); + LIBC_UNLIKELY(r.has_value())) + return r.value(); +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + + float xf = x; + // Degree-5 minimax polynomial generated by Sollya with the following + // commands: + // > display = hexadecimal; + // > P = fpminimax((2^x - 1)/x, 4, [|SG...|], [-2^-3, 2^-3]); + // > x * P; + return fputil::cast( + xf * fputil::polyeval(xf, 0x1.62e43p-1f, 0x1.ebfbdep-3f, + 0x1.c6af88p-5f, 0x1.3b45d6p-7f, + 0x1.641e7cp-10f)); + } + } + +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS + if (auto r = EXP2M1F16_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value())) + return r.value(); +#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS + + // exp2(x) = exp2(hi + mid) * exp2(lo) + auto [exp2_hi_mid, exp2_lo] = exp2_range_reduction(x); + // exp2m1(x) = exp2(hi + mid) * exp2(lo) - 1 + return fputil::cast( + fputil::multiply_add(exp2_hi_mid, exp2_lo, -1.0f)); +} + +} // namespace math + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LIBC_TYPES_HAS_FLOAT16 + +#endif // LLVM_LIBC_SRC___SUPPORT_MATH_EXP2M1F16_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index c048a64db6bc2..e71300536616b 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -1498,19 +1498,7 @@ add_entrypoint_object( HDRS ../exp2m1f16.h DEPENDS - libc.hdr.errno_macros - libc.hdr.fenv_macros - libc.src.__support.common - libc.src.__support.FPUtil.cast - libc.src.__support.FPUtil.except_value_utils - libc.src.__support.FPUtil.fenv_impl - libc.src.__support.FPUtil.fp_bits - libc.src.__support.FPUtil.multiply_add - libc.src.__support.FPUtil.polyeval - libc.src.__support.FPUtil.rounding_mode - libc.src.__support.macros.optimization - libc.src.__support.macros.properties.cpu_features - libc.src.__support.math.expxf16_utils + libc.src.__support.math.exp2m1f16 ) add_entrypoint_object( diff --git a/libc/src/math/generic/exp2m1f16.cpp b/libc/src/math/generic/exp2m1f16.cpp index ce0cc60748f19..497a2887cea4c 100644 --- a/libc/src/math/generic/exp2m1f16.cpp +++ b/libc/src/math/generic/exp2m1f16.cpp @@ -7,163 +7,12 @@ //===----------------------------------------------------------------------===// #include "src/math/exp2m1f16.h" -#include "hdr/errno_macros.h" -#include "hdr/fenv_macros.h" -#include "src/__support/FPUtil/FEnvImpl.h" -#include "src/__support/FPUtil/FPBits.h" -#include "src/__support/FPUtil/PolyEval.h" -#include "src/__support/FPUtil/cast.h" -#include "src/__support/FPUtil/except_value_utils.h" 
-#include "src/__support/FPUtil/multiply_add.h" -#include "src/__support/FPUtil/rounding_mode.h" -#include "src/__support/common.h" -#include "src/__support/macros/config.h" -#include "src/__support/macros/optimization.h" -#include "src/__support/macros/properties/cpu_features.h" -#include "src/__support/math/expxf16_utils.h" +#include "src/__support/math/exp2m1f16.h" namespace LIBC_NAMESPACE_DECL { -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS -static constexpr fputil::ExceptValues EXP2M1F16_EXCEPTS_LO = {{ - // (input, RZ output, RU offset, RD offset, RN offset) - // x = 0x1.cf4p-13, exp2m1f16(x) = 0x1.41p-13 (RZ) - {0x0b3dU, 0x0904U, 1U, 0U, 1U}, - // x = 0x1.4fcp-12, exp2m1f16(x) = 0x1.d14p-13 (RZ) - {0x0d3fU, 0x0b45U, 1U, 0U, 1U}, - // x = 0x1.63p-11, exp2m1f16(x) = 0x1.ec4p-12 (RZ) - {0x118cU, 0x0fb1U, 1U, 0U, 0U}, - // x = 0x1.6fp-7, exp2m1f16(x) = 0x1.fe8p-8 (RZ) - {0x21bcU, 0x1ffaU, 1U, 0U, 1U}, - // x = -0x1.c6p-10, exp2m1f16(x) = -0x1.3a8p-10 (RZ) - {0x9718U, 0x94eaU, 0U, 1U, 0U}, - // x = -0x1.cfcp-10, exp2m1f16(x) = -0x1.414p-10 (RZ) - {0x973fU, 0x9505U, 0U, 1U, 0U}, -}}; - -#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT -static constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 6; -#else -static constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 7; -#endif - -static constexpr fputil::ExceptValues - EXP2M1F16_EXCEPTS_HI = {{ - // (input, RZ output, RU offset, RD offset, RN offset) - // x = 0x1.e58p-3, exp2m1f16(x) = 0x1.6dcp-3 (RZ) - {0x3396U, 0x31b7U, 1U, 0U, 0U}, -#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT - // x = 0x1.2e8p-2, exp2m1f16(x) = 0x1.d14p-3 (RZ) - {0x34baU, 0x3345U, 1U, 0U, 0U}, -#endif - // x = 0x1.ad8p-2, exp2m1f16(x) = 0x1.598p-2 (RZ) - {0x36b6U, 0x3566U, 1U, 0U, 0U}, -#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT - // x = 0x1.edcp-2, exp2m1f16(x) = 0x1.964p-2 (RZ) - {0x37b7U, 0x3659U, 1U, 0U, 1U}, -#endif - // x = -0x1.804p-3, exp2m1f16(x) = -0x1.f34p-4 (RZ) - {0xb201U, 0xafcdU, 0U, 1U, 1U}, - // x = -0x1.f3p-3, exp2m1f16(x) = -0x1.3e4p-3 (RZ) - {0xb3ccU, 0xb0f9U, 0U, 1U, 0U}, - // x = -0x1.294p-1, exp2m1f16(x) = -0x1.53p-2 (RZ) - {0xb8a5U, 0xb54cU, 0U, 1U, 1U}, -#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT - // x = -0x1.a34p-1, exp2m1f16(x) = -0x1.bb4p-2 (RZ) - {0xba8dU, 0xb6edU, 0U, 1U, 1U}, -#endif - }}; -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS - LLVM_LIBC_FUNCTION(float16, exp2m1f16, (float16 x)) { - using namespace math::expxf16_internal; - using FPBits = fputil::FPBits; - FPBits x_bits(x); - - uint16_t x_u = x_bits.uintval(); - uint16_t x_abs = x_u & 0x7fffU; - - // When |x| <= 2^(-3), or |x| >= 11, or x is NaN. - if (LIBC_UNLIKELY(x_abs <= 0x3000U || x_abs >= 0x4980U)) { - // exp2m1(NaN) = NaN - if (x_bits.is_nan()) { - if (x_bits.is_signaling_nan()) { - fputil::raise_except_if_required(FE_INVALID); - return FPBits::quiet_nan().get_val(); - } - - return x; - } - - // When x >= 16. - if (x_u >= 0x4c00 && x_bits.is_pos()) { - // exp2m1(+inf) = +inf - if (x_bits.is_inf()) - return FPBits::inf().get_val(); - - switch (fputil::quick_get_round()) { - case FE_TONEAREST: - case FE_UPWARD: - fputil::set_errno_if_required(ERANGE); - fputil::raise_except_if_required(FE_OVERFLOW | FE_INEXACT); - return FPBits::inf().get_val(); - default: - return FPBits::max_normal().get_val(); - } - } - - // When x < -11. - if (x_u > 0xc980U) { - // exp2m1(-inf) = -1 - if (x_bits.is_inf()) - return FPBits::one(Sign::NEG).get_val(); - - // When -12 < x < -11, round(2^x - 1, HP, RN) = -0x1.ffcp-1. 
- if (x_u < 0xca00U) - return fputil::round_result_slightly_down( - fputil::cast(-0x1.ffcp-1)); - - // When x <= -12, round(2^x - 1, HP, RN) = -1. - switch (fputil::quick_get_round()) { - case FE_TONEAREST: - case FE_DOWNWARD: - return FPBits::one(Sign::NEG).get_val(); - default: - return fputil::cast(-0x1.ffcp-1); - } - } - - // When |x| <= 2^(-3). - if (x_abs <= 0x3000U) { -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - if (auto r = EXP2M1F16_EXCEPTS_LO.lookup(x_u); - LIBC_UNLIKELY(r.has_value())) - return r.value(); -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS - - float xf = x; - // Degree-5 minimax polynomial generated by Sollya with the following - // commands: - // > display = hexadecimal; - // > P = fpminimax((2^x - 1)/x, 4, [|SG...|], [-2^-3, 2^-3]); - // > x * P; - return fputil::cast( - xf * fputil::polyeval(xf, 0x1.62e43p-1f, 0x1.ebfbdep-3f, - 0x1.c6af88p-5f, 0x1.3b45d6p-7f, - 0x1.641e7cp-10f)); - } - } - -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS - if (auto r = EXP2M1F16_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value())) - return r.value(); -#endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS - - // exp2(x) = exp2(hi + mid) * exp2(lo) - auto [exp2_hi_mid, exp2_lo] = exp2_range_reduction(x); - // exp2m1(x) = exp2(hi + mid) * exp2(lo) - 1 - return fputil::cast( - fputil::multiply_add(exp2_hi_mid, exp2_lo, -1.0f)); + return math::exp2m1f16(x); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/UnitTest/FEnvSafeTest.cpp b/libc/test/UnitTest/FEnvSafeTest.cpp index 4393f9d5e5c3b..64f50d7be7fe3 100644 --- a/libc/test/UnitTest/FEnvSafeTest.cpp +++ b/libc/test/UnitTest/FEnvSafeTest.cpp @@ -43,7 +43,8 @@ void FEnvSafeTest::set_fenv(const fenv_t &fenv) { void FEnvSafeTest::expect_fenv_eq(const fenv_t &before_fenv, const fenv_t &after_fenv) { -#if defined(LIBC_TARGET_ARCH_IS_AARCH64) && !defined(LIBC_COMPILER_IS_MSVC) +#if defined(LIBC_TARGET_ARCH_IS_AARCH64) && !defined(LIBC_COMPILER_IS_MSVC) && \ + defined(__ARM_FP) using FPState = LIBC_NAMESPACE::fputil::FEnv::FPState; const FPState &before_state = reinterpret_cast(before_fenv); const FPState &after_state = reinterpret_cast(after_fenv); diff --git a/libc/test/shared/CMakeLists.txt b/libc/test/shared/CMakeLists.txt index aede395350821..762b5b0417ef6 100644 --- a/libc/test/shared/CMakeLists.txt +++ b/libc/test/shared/CMakeLists.txt @@ -44,6 +44,7 @@ add_fp_unittest( libc.src.__support.math.exp2f libc.src.__support.math.exp2f16 libc.src.__support.math.exp2m1f + libc.src.__support.math.exp2m1f16 libc.src.__support.math.exp10 libc.src.__support.math.exp10f libc.src.__support.math.exp10f16 diff --git a/libc/test/shared/shared_math_test.cpp b/libc/test/shared/shared_math_test.cpp index a6825a10654c9..5b409781a5b07 100644 --- a/libc/test/shared/shared_math_test.cpp +++ b/libc/test/shared/shared_math_test.cpp @@ -29,6 +29,7 @@ TEST(LlvmLibcSharedMathTest, AllFloat16) { EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::exp10f16(0.0f16)); EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::exp10m1f16(0.0f16)); EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::exp2f16(0.0f16)); + EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::exp2m1f16(0.0f16)); EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::expf16(0.0f16)); ASSERT_FP_EQ(float16(8 << 5), LIBC_NAMESPACE::shared::ldexpf16(8.0f16, 5)); diff --git a/llvm/include/llvm/IR/IntrinsicsLoongArch.td b/llvm/include/llvm/IR/IntrinsicsLoongArch.td index 84026aa9d3624..1c46965d995fe 100644 --- a/llvm/include/llvm/IR/IntrinsicsLoongArch.td +++ b/llvm/include/llvm/IR/IntrinsicsLoongArch.td @@ -1192,4 +1192,42 @@ def 
int_loongarch_lasx_xvstelm_w def int_loongarch_lasx_xvstelm_d : VecInt<[], [llvm_v4i64_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrWriteMem, IntrArgMemOnly, ImmArg>, ImmArg>]>; + +// LASX and LSX conversion +def int_loongarch_lasx_cast_128_s + : VecInt<[llvm_v8f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_cast_128_d + : VecInt<[llvm_v4f64_ty], [llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_cast_128 + : VecInt<[llvm_v4i64_ty], [llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128_s + : VecInt<[llvm_v8f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128_d + : VecInt<[llvm_v4f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128 + : VecInt<[llvm_v4i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo_s + : VecInt<[llvm_v4f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo_d + : VecInt<[llvm_v2f64_ty], [llvm_v4f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo + : VecInt<[llvm_v2i64_ty], [llvm_v4i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi_s + : VecInt<[llvm_v4f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi_d + : VecInt<[llvm_v2f64_ty], [llvm_v4f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi + : VecInt<[llvm_v2i64_ty], [llvm_v4i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo_s + : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo_d + : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo + : VecInt<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi_s + : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi_d + : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi + : VecInt<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; } // TargetPrefix = "loongarch" diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index fe700e17d341b..cf4ffc82f6009 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -6630,6 +6630,11 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(LoongArchISD::VANY_NONZERO, DL, N->getValueType(0), N->getOperand(1)); break; + case Intrinsic::loongarch_lasx_concat_128_s: + case Intrinsic::loongarch_lasx_concat_128_d: + case Intrinsic::loongarch_lasx_concat_128: + return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), + N->getOperand(1), N->getOperand(2)); } return SDValue(); } diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index b502b056c4cdf..00d52870f1727 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -2113,6 +2113,37 @@ defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; +// LASX and LSX conversion +def : Pat<(int_loongarch_lasx_cast_128_s (v4f32 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_cast_128_d (v2f64 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : 
Pat<(int_loongarch_lasx_cast_128 (v2i64 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo_s (v8f32 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo_d (v4f64 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo (v4i64 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi_s (v8f32 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi_d (v4f64 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi (v4i64 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_insert_128_lo_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_lo_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_lo (v4i64 LASX256:$src), (v2i64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_hi_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; +def : Pat<(int_loongarch_lasx_insert_128_hi_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; +def : Pat<(int_loongarch_lasx_insert_128_hi (v4i64 LASX256:$src), (v2i64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; } // Predicates = [HasExtLASX] /// Intrinsic pattern diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt index 0ff178e1f1959..e9088a4d9275c 100644 --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -58,6 +58,7 @@ add_llvm_target(RISCVCodeGen RISCVMoveMerger.cpp RISCVOptWInstrs.cpp RISCVPostRAExpandPseudoInsts.cpp + RISCVPromoteConstant.cpp RISCVPushPopOptimizer.cpp RISCVRedundantCopyElimination.cpp RISCVRegisterInfo.cpp diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h index ae9410193efe1..51e8e8574ed15 100644 --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -20,6 +20,7 @@ namespace llvm { class FunctionPass; class InstructionSelector; +class ModulePass; class PassRegistry; class RISCVRegisterBankInfo; class RISCVSubtarget; @@ -111,6 +112,9 @@ void initializeRISCVO0PreLegalizerCombinerPass(PassRegistry &); FunctionPass *createRISCVPreLegalizerCombiner(); void initializeRISCVPreLegalizerCombinerPass(PassRegistry &); +ModulePass *createRISCVPromoteConstantPass(); +void initializeRISCVPromoteConstantPass(PassRegistry &); + FunctionPass *createRISCVVLOptimizerPass(); void initializeRISCVVLOptimizerPass(PassRegistry &); diff --git a/llvm/lib/Target/RISCV/RISCVPromoteConstant.cpp b/llvm/lib/Target/RISCV/RISCVPromoteConstant.cpp new file mode 100644 index 0000000000000..bf1f69f8e8d93 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVPromoteConstant.cpp @@ -0,0 +1,213 @@ +//==- RISCVPromoteConstant.cpp - Promote constant fp to global for RISC-V --==// +// +// Part of the LLVM 
Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RISCV.h" +#include "RISCVSubtarget.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "riscv-promote-const" +#define RISCV_PROMOTE_CONSTANT_NAME "RISC-V Promote Constants" + +STATISTIC(NumPromoted, "Number of constant literals promoted to globals"); +STATISTIC(NumPromotedUses, "Number of uses of promoted literal constants"); + +namespace { + +class RISCVPromoteConstant : public ModulePass { +public: + static char ID; + RISCVPromoteConstant() : ModulePass(ID) {} + + StringRef getPassName() const override { return RISCV_PROMOTE_CONSTANT_NAME; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetPassConfig>(); + AU.setPreservesCFG(); + } + + /// Iterate over the functions and promote the double fp constants that + /// would otherwise go into the constant pool to a constant array. + bool runOnModule(Module &M) override { + if (skipModule(M)) + return false; + // TargetMachine and Subtarget are needed to query isFPImmLegal. + const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); + const TargetMachine &TM = TPC.getTM<TargetMachine>(); + bool Changed = false; + for (Function &F : M) { + const RISCVSubtarget &ST = TM.getSubtarget<RISCVSubtarget>(F); + const RISCVTargetLowering *TLI = ST.getTargetLowering(); + Changed |= runOnFunction(F, TLI); + } + return Changed; + } + +private: + bool runOnFunction(Function &F, const RISCVTargetLowering *TLI); +}; +} // end anonymous namespace + +char RISCVPromoteConstant::ID = 0; + +INITIALIZE_PASS(RISCVPromoteConstant, DEBUG_TYPE, RISCV_PROMOTE_CONSTANT_NAME, + false, false) + +ModulePass *llvm::createRISCVPromoteConstantPass() { + return new RISCVPromoteConstant(); +} + +bool RISCVPromoteConstant::runOnFunction(Function &F, + const RISCVTargetLowering *TLI) { + if (F.hasOptNone() || F.hasOptSize()) + return false; + + // Bail out and make no transformation if the target doesn't support + // doubles, or if we're not targeting RV64 as we currently see some + // regressions for those targets. + if (!TLI->isTypeLegal(MVT::f64) || !TLI->isTypeLegal(MVT::i64)) + return false; + + // Collect all unique double constants and their uses in the function. Use + // MapVector to preserve insertion order. + MapVector<ConstantFP *, SmallVector<Use *>> ConstUsesMap; + + for (Instruction &I : instructions(F)) { + for (Use &U : I.operands()) { + auto *C = dyn_cast<ConstantFP>(U.get()); + if (!C || !C->getType()->isDoubleTy()) + continue; + // Do not promote if it wouldn't be loaded from the constant pool.
+ if (TLI->isFPImmLegal(C->getValueAPF(), MVT::f64, + /*ForCodeSize=*/false)) + continue; + // Do not promote a constant if it is used as an immediate argument + // for an intrinsic. + if (auto *II = dyn_cast<IntrinsicInst>(U.getUser())) { + Function *IntrinsicFunc = II->getCalledFunction(); + unsigned OperandIdx = U.getOperandNo(); + if (IntrinsicFunc && IntrinsicFunc->getAttributes().hasParamAttr( + OperandIdx, Attribute::ImmArg)) { + LLVM_DEBUG(dbgs() << "Skipping promotion of constant in: " << *II + << " because operand " << OperandIdx + << " must be an immediate.\n"); + continue; + } + } + // Note: FP args to inline asm would be problematic if we had a + // constraint that required an immediate floating point operand. At the + // time of writing LLVM doesn't recognise such a constraint. + ConstUsesMap[C].push_back(&U); + } + } + + int PromotableConstants = ConstUsesMap.size(); + LLVM_DEBUG(dbgs() << "Found " << PromotableConstants + << " promotable constants in " << F.getName() << "\n"); + // Bail out if no promotable constants found, or if only one is found. + if (PromotableConstants < 2) { + LLVM_DEBUG(dbgs() << "Performing no promotions as insufficient promotable " + "constants found\n"); + return false; + } + + NumPromoted += PromotableConstants; + + // Create a global array containing the promoted constants. + Module *M = F.getParent(); + Type *DoubleTy = Type::getDoubleTy(M->getContext()); + + SmallVector<Constant *> ConstantVector; + for (auto const &Pair : ConstUsesMap) + ConstantVector.push_back(Pair.first); + + ArrayType *ArrayTy = ArrayType::get(DoubleTy, ConstantVector.size()); + Constant *GlobalArrayInitializer = + ConstantArray::get(ArrayTy, ConstantVector); + + auto *GlobalArray = new GlobalVariable( + *M, ArrayTy, + /*isConstant=*/true, GlobalValue::InternalLinkage, GlobalArrayInitializer, + ".promoted_doubles." + F.getName()); + + // A cache to hold the loaded value for a given constant within a basic block. + DenseMap<std::pair<ConstantFP *, BasicBlock *>, Value *> LocalLoads; + + // Replace all uses with the loaded value. + unsigned Idx = 0; + for (auto const &Pair : ConstUsesMap) { + ConstantFP *Const = Pair.first; + const SmallVector<Use *> &Uses = Pair.second; + + for (Use *U : Uses) { + Instruction *UserInst = cast<Instruction>(U->getUser()); + BasicBlock *InsertionBB; + + // If the user is a PHI node, we must insert the load in the + // corresponding predecessor basic block. Otherwise, it's inserted into + // the same block as the use. + if (auto *PN = dyn_cast<PHINode>(UserInst)) + InsertionBB = PN->getIncomingBlock(*U); + else + InsertionBB = UserInst->getParent(); + + if (isa<CatchSwitchInst>(InsertionBB->getTerminator())) { + LLVM_DEBUG(dbgs() << "Bailing out: catchswitch means there is no valid " + "insertion point.\n"); + return false; + } + + auto CacheKey = std::make_pair(Const, InsertionBB); + Value *LoadedVal = nullptr; + + // Re-use a load if it exists in the insertion block. + if (LocalLoads.count(CacheKey)) { + LoadedVal = LocalLoads.at(CacheKey); + } else { + // Otherwise, create a new GEP and Load at the correct insertion point. + // It is always safe to insert in the first insertion point in the BB, + // so do that and let other passes reorder. + IRBuilder<> Builder(InsertionBB, InsertionBB->getFirstInsertionPt()); + Value *ElementPtr = Builder.CreateConstInBoundsGEP2_64( + GlobalArray->getValueType(), GlobalArray, 0, Idx, "double.addr"); + LoadedVal = Builder.CreateLoad(DoubleTy, ElementPtr, "double.val"); + + // Cache the newly created load for this block.
+ LocalLoads[CacheKey] = LoadedVal; + } + + U->set(LoadedVal); + ++NumPromotedUses; + } + ++Idx; + } + + return true; +} diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index ae54ff1515121..16ef67da83128 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -139,6 +139,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() { initializeRISCVExpandAtomicPseudoPass(*PR); initializeRISCVRedundantCopyEliminationPass(*PR); initializeRISCVAsmPrinterPass(*PR); + initializeRISCVPromoteConstantPass(*PR); } static Reloc::Model getEffectiveRelocModel(std::optional RM) { @@ -462,6 +463,8 @@ void RISCVPassConfig::addIRPasses() { } bool RISCVPassConfig::addPreISel() { + if (TM->getOptLevel() != CodeGenOptLevel::None) + addPass(createRISCVPromoteConstantPass()); if (TM->getOptLevel() != CodeGenOptLevel::None) { // Add a barrier before instruction selection so that we will not get // deleted block address after enabling default outlining. See D99707 for diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 2588c878d8472..9e65399e75dc7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -167,8 +167,7 @@ static bool sinkScalarOperands(VPlan &Plan) { if (!isa(Candidate)) return; - if (Candidate->getParent() == SinkTo || Candidate->mayHaveSideEffects() || - Candidate->mayReadOrWriteMemory()) + if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate)) return; if (auto *RepR = dyn_cast(Candidate)) diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll new file mode 100644 index 0000000000000..006713ccabf47 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll @@ -0,0 +1,303 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +declare <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float>) + +define void @lasx_cast_128_s(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_cast_128_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x float>, ptr %va + %b = call <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float> %a) + store <8 x float> %b, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double>) + +define void @lasx_cast_128_d(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_cast_128_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x double>, ptr %va + %b = call <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double> %a) + store <4 x double> %b, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64>) + +define void @lasx_cast_128(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_cast_128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x i64>, ptr %va + %b = call <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64> %a) + store <4 x i64> %b, ptr %vd + ret void +} + +declare <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float>, <4 x float>) + +define void 
@lasx_concat_128_s(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_concat_128_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double>, <2 x double>) + +define void @lasx_concat_128_d(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_concat_128_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64>, <2 x i64>) + +define void @lasx_concat_128(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_concat_128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void +} + +declare <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float>) + +define void @lasx_extract_128_lo_s(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_lo_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %c = call <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float> %a) + store <4 x float> %c, ptr %vd + ret void +} + +declare <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double>) + +define void @lasx_extract_128_lo_d(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_lo_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %c = call <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double> %a) + store <2 x double> %c, ptr %vd + ret void +} + +declare <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64>) + +define void @lasx_extract_128_lo(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_lo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %c = call <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64> %a) + store <2 x i64> %c, ptr %vd + ret void +} + +declare <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float>) + +define void @lasx_extract_128_hi_s(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_hi_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %c = call <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float> %a) + store <4 x float> %c, ptr %vd + ret void +} + +declare <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double>) + +define void @lasx_extract_128_hi_d(ptr %vd, ptr 
%va) { +; CHECK-LABEL: lasx_extract_128_hi_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %c = call <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double> %a) + store <2 x double> %c, ptr %vd + ret void +} + +declare <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64>) + +define void @lasx_extract_128_hi(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_hi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %c = call <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64> %a) + store <2 x i64> %c, ptr %vd + ret void +} + +declare <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float>, <4 x float>) + +define void @lasx_insert_128_lo_s(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_lo_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double>, <2 x double>) + +define void @lasx_insert_128_lo_d(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_lo_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64>, <2 x i64>) + +define void @lasx_insert_128_lo(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_lo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void +} + +declare <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float>, <4 x float>) + +define void @lasx_insert_128_hi_s(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_hi_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double>, <2 x double>) + +define void @lasx_insert_128_hi_d(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_hi_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + 
%a = load <4 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64>, <2 x i64>) + +define void @lasx_insert_128_hi(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_hi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void +} diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll index ea08061221fd4..769823d1c4216 100644 --- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll +++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll @@ -75,6 +75,7 @@ ; CHECK-NEXT: CodeGen Prepare ; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Exception handling preparation +; CHECK-NEXT: RISC-V Promote Constants ; CHECK-NEXT: A No-Op Barrier Pass ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Merge internal globals diff --git a/llvm/test/CodeGen/RISCV/cfi-multiple-locations.mir b/llvm/test/CodeGen/RISCV/cfi-multiple-locations.mir new file mode 100644 index 0000000000000..7844589e3f93c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/cfi-multiple-locations.mir @@ -0,0 +1,35 @@ +# RUN: llc %s -mtriple=riscv64 \ +# RUN: -run-pass=cfi-instr-inserter \ +# RUN: -riscv-enable-cfi-instr-inserter=true +# XFAIL: * + +# Technically, it is possible that a callee-saved register is saved in multiple different locations. +# CFIInstrInserter should handle this, but currently it does not. +--- +name: multiple_locations +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x10, $x9, $x2 + BEQ $x10, $x0, %bb.3 + PseudoBR %bb.2 + + bb.1: + liveins: $x10, $x9, $x2 + $x5 = COPY $x9 + CFI_INSTRUCTION register $x9, $x5 + $x9 = COPY $x5 + CFI_INSTRUCTION register $x9, $x9 + PseudoBR %bb.3 + + bb.2: + liveins: $x10, $x9, $x2 + SD $x9, $x2, 0 :: (store (s64)) + CFI_INSTRUCTION offset $x9, 0 + $x9 = LD $x2, 0 :: (load (s64)) + CFI_INSTRUCTION register $x9, $x9 + PseudoBR %bb.3 + + bb.3: + PseudoRET +... diff --git a/llvm/test/CodeGen/RISCV/riscv-promote-constant.ll b/llvm/test/CodeGen/RISCV/riscv-promote-constant.ll new file mode 100644 index 0000000000000..2bde6013b3640 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/riscv-promote-constant.ll @@ -0,0 +1,148 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt %s -S -riscv-promote-const -mtriple=riscv64 -mattr=+d | FileCheck %s + +; No promotion should take place, as the pass skips floats. +define float @multiple_floats(float %a, float %b) { +; CHECK-LABEL: define float @multiple_floats( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ADD1:%.*]] = fadd float [[A]], 1.000000e+00 +; CHECK-NEXT: [[ADD2:%.*]] = fadd float [[B]], 2.000000e+00 +; CHECK-NEXT: [[SUM_F:%.*]] = fadd float [[ADD1]], [[ADD2]] +; CHECK-NEXT: ret float [[SUM_F]] +; +entry: + %add1 = fadd float %a, 1.0 + %add2 = fadd float %b, 2.0 + %sum_f = fadd float %add1, %add2 + ret float %sum_f +} + +; No promotion should take place as cases with a single constant are skipped. 
+define double @single_double(double %a) { +; CHECK-LABEL: define double @single_double( +; CHECK-SAME: double [[A:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[A]], 4.210000e+01 +; CHECK-NEXT: ret double [[ADD]] +; +entry: + %add = fadd double %a, 42.1 + ret double %add +} + +; Promotion should happen as we have at least two unique constants that would +; otherwise go in the constant pool. +define double @multiple_doubles(double %a, double %b) { +; CHECK-LABEL: define double @multiple_doubles( +; CHECK-SAME: double [[A:%.*]], double [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[DOUBLE_VAL1:%.*]] = load double, ptr getelementptr inbounds ([2 x double], ptr @.promoted_doubles.multiple_doubles, i64 0, i64 1), align 8 +; CHECK-NEXT: [[ADD3:%.*]] = load double, ptr @.promoted_doubles.multiple_doubles, align 8 +; CHECK-NEXT: [[ADD2:%.*]] = fadd double [[A]], [[ADD3]] +; CHECK-NEXT: [[ADD4:%.*]] = fadd double [[B]], [[DOUBLE_VAL1]] +; CHECK-NEXT: [[SUM:%.*]] = fadd double [[ADD2]], [[ADD3]] +; CHECK-NEXT: [[SUM1:%.*]] = fadd double [[ADD4]], [[SUM]] +; CHECK-NEXT: ret double [[SUM1]] +; +entry: + %add1 = fadd double %a, 2.718 + %add2 = fadd double %b, 42.1 + %add3 = fadd double %add1, 2.718 + %sum = fadd double %add2, %add3 + ret double %sum +} + +; Promotion should not happen as the constants will be materialised rather +; than using the constant pool. +define double @multiple_doubles_no_promote(double %a, double %b) { +; CHECK-LABEL: define double @multiple_doubles_no_promote( +; CHECK-SAME: double [[A:%.*]], double [[B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ADD1:%.*]] = fadd double [[A]], 1.000000e+00 +; CHECK-NEXT: [[ADD2:%.*]] = fadd double [[B]], 2.000000e+00 +; CHECK-NEXT: [[ADD3:%.*]] = fadd double [[ADD1]], 1.000000e+00 +; CHECK-NEXT: [[SUM:%.*]] = fadd double [[ADD2]], [[ADD3]] +; CHECK-NEXT: ret double [[SUM]] +; +entry: + %add1 = fadd double %a, 1.0 + %add2 = fadd double %b, 2.0 + %add3 = fadd double %add1, 1.0 + %sum = fadd double %add2, %add3 + ret double %sum +} + +; The same constant shouldn't be loaded more than once per BB. 
+define double @multiple_doubles_multi_bb(double %a, i1 %cond) { +; CHECK-LABEL: define double @multiple_doubles_multi_bb( +; CHECK-SAME: double [[A:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND]], label %[[IF_TRUE:.*]], label %[[IF_FALSE:.*]] +; CHECK: [[IF_TRUE]]: +; CHECK-NEXT: [[DOUBLE_VAL2:%.*]] = load double, ptr getelementptr inbounds ([2 x double], ptr @.promoted_doubles.multiple_doubles_multi_bb, i64 0, i64 1), align 8 +; CHECK-NEXT: [[DOUBLE_VAL:%.*]] = load double, ptr @.promoted_doubles.multiple_doubles_multi_bb, align 8 +; CHECK-NEXT: [[ADD_T:%.*]] = fadd double [[A]], [[DOUBLE_VAL]] +; CHECK-NEXT: [[MUL_T:%.*]] = fmul double [[ADD_T]], [[DOUBLE_VAL2]] +; CHECK-NEXT: [[SUB_T:%.*]] = fsub double [[MUL_T]], [[DOUBLE_VAL]] +; CHECK-NEXT: br label %[[IF_END:.*]] +; CHECK: [[IF_FALSE]]: +; CHECK-NEXT: [[DOUBLE_VAL3:%.*]] = load double, ptr getelementptr inbounds ([2 x double], ptr @.promoted_doubles.multiple_doubles_multi_bb, i64 0, i64 1), align 8 +; CHECK-NEXT: [[DOUBLE_VAL1:%.*]] = load double, ptr @.promoted_doubles.multiple_doubles_multi_bb, align 8 +; CHECK-NEXT: [[ADD_F:%.*]] = fadd double [[A]], [[DOUBLE_VAL1]] +; CHECK-NEXT: [[MUL_F:%.*]] = fmul double [[ADD_F]], [[DOUBLE_VAL3]] +; CHECK-NEXT: [[SUB_F:%.*]] = fsub double [[MUL_F]], [[DOUBLE_VAL1]] +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: [[PHI_RES:%.*]] = phi double [ [[SUB_T]], %[[IF_TRUE]] ], [ [[SUB_F]], %[[IF_FALSE]] ] +; CHECK-NEXT: ret double [[PHI_RES]] +; +entry: + br i1 %cond, label %if.true, label %if.false + +if.true: + %add.t = fadd double %a, 1.23 + %mul.t = fmul double %add.t, 4.56 + %sub.t = fsub double %mul.t, 1.23 + br label %if.end + +if.false: + %add.f = fadd double %a, 1.23 + %mul.f = fmul double %add.f, 4.56 + %sub.f = fsub double %mul.f, 1.23 + br label %if.end + +if.end: + %phi.res = phi double [ %sub.t, %if.true ], [ %sub.f, %if.false ] + ret double %phi.res +} + +; Check the insertion point in the case we have a phi taking a constant C and +; the source block also uses that same constant. 
+define double @multiple_doubles_phi(double %a, i1 %cond) { +; CHECK-LABEL: define double @multiple_doubles_phi( +; CHECK-SAME: double [[A:%.*]], i1 [[COND:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 [[COND]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: [[DOUBLE_VAL:%.*]] = load double, ptr @.promoted_doubles.multiple_doubles_phi, align 8 +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A]], [[DOUBLE_VAL]] +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: [[PHI_VAL:%.*]] = phi double [ [[DOUBLE_VAL]], %[[IF_THEN]] ], [ [[A]], %[[ENTRY]] ] +; CHECK-NEXT: [[DOUBLE_VAL1:%.*]] = load double, ptr getelementptr inbounds ([2 x double], ptr @.promoted_doubles.multiple_doubles_phi, i64 0, i64 1), align 8 +; CHECK-NEXT: [[RES:%.*]] = fadd double [[PHI_VAL]], [[DOUBLE_VAL1]] +; CHECK-NEXT: ret double [[RES]] +; +entry: + br i1 %cond, label %if.then, label %if.end + +if.then: + %mul = fmul double %a, 1.23 + br label %if.end + +if.end: + %phi.val = phi double [ 1.23, %if.then ], [ %a, %entry ] + %res = fadd double %phi.val, 4.56 + ret double %res +} diff --git a/llvm/test/CodeGen/X86/pr166534.ll b/llvm/test/CodeGen/X86/pr166534.ll new file mode 100644 index 0000000000000..aef44cc3e40d0 --- /dev/null +++ b/llvm/test/CodeGen/X86/pr166534.ll @@ -0,0 +1,124 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE4 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512 + +define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) { +; SSE2-LABEL: pr166534: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: movq (%rdi), %rax +; SSE2-NEXT: movq 8(%rdi), %r8 +; SSE2-NEXT: movdqu (%rdi), %xmm0 +; SSE2-NEXT: movq (%rsi), %r9 +; SSE2-NEXT: movq 8(%rsi), %rdi +; SSE2-NEXT: movdqu (%rsi), %xmm1 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: pmovmskb %xmm1, %esi +; SSE2-NEXT: xorl %r10d, %r10d +; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF +; SSE2-NEXT: sete %r10b +; SSE2-NEXT: orq %r10, (%rdx) +; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF +; SSE2-NEXT: jne .LBB0_2 +; SSE2-NEXT: # %bb.1: # %if.then +; SSE2-NEXT: xorq %r9, %rax +; SSE2-NEXT: xorq %rdi, %r8 +; SSE2-NEXT: xorl %edx, %edx +; SSE2-NEXT: orq %rax, %r8 +; SSE2-NEXT: sete %dl +; SSE2-NEXT: orq %rdx, (%rcx) +; SSE2-NEXT: .LBB0_2: # %if.end +; SSE2-NEXT: retq +; +; SSE4-LABEL: pr166534: +; SSE4: # %bb.0: # %entry +; SSE4-NEXT: movq (%rdi), %rax +; SSE4-NEXT: movq 8(%rdi), %r8 +; SSE4-NEXT: movdqu (%rdi), %xmm0 +; SSE4-NEXT: movq (%rsi), %r9 +; SSE4-NEXT: movq 8(%rsi), %rdi +; SSE4-NEXT: movdqu (%rsi), %xmm1 +; SSE4-NEXT: pxor %xmm0, %xmm1 +; SSE4-NEXT: xorl %esi, %esi +; SSE4-NEXT: ptest %xmm1, %xmm1 +; SSE4-NEXT: sete %sil +; SSE4-NEXT: orq %rsi, (%rdx) +; SSE4-NEXT: ptest %xmm1, %xmm1 +; SSE4-NEXT: jne .LBB0_2 +; SSE4-NEXT: # %bb.1: # %if.then +; SSE4-NEXT: xorq %r9, %rax +; SSE4-NEXT: xorq %rdi, %r8 +; SSE4-NEXT: xorl %edx, %edx +; SSE4-NEXT: orq %rax, %r8 +; SSE4-NEXT: sete %dl +; SSE4-NEXT: orq %rdx, (%rcx) +; SSE4-NEXT: .LBB0_2: # %if.end +; SSE4-NEXT: retq +; +; AVX2-LABEL: pr166534: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: movq (%rdi), %rax +; AVX2-NEXT: movq 8(%rdi), %r8 +; AVX2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX2-NEXT: movq (%rsi), %rdi +; AVX2-NEXT: vpxor (%rsi), 
%xmm0, %xmm0 +; AVX2-NEXT: movq 8(%rsi), %rsi +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: vptest %xmm0, %xmm0 +; AVX2-NEXT: sete %r9b +; AVX2-NEXT: orq %r9, (%rdx) +; AVX2-NEXT: vptest %xmm0, %xmm0 +; AVX2-NEXT: jne .LBB0_2 +; AVX2-NEXT: # %bb.1: # %if.then +; AVX2-NEXT: xorq %rdi, %rax +; AVX2-NEXT: xorq %rsi, %r8 +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: orq %rax, %r8 +; AVX2-NEXT: sete %dl +; AVX2-NEXT: orq %rdx, (%rcx) +; AVX2-NEXT: .LBB0_2: # %if.end +; AVX2-NEXT: retq +; +; AVX512-LABEL: pr166534: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: movq (%rdi), %rax +; AVX512-NEXT: movq 8(%rdi), %r8 +; AVX512-NEXT: vmovdqu (%rdi), %xmm0 +; AVX512-NEXT: movq (%rsi), %r9 +; AVX512-NEXT: movq 8(%rsi), %rdi +; AVX512-NEXT: vpxor (%rsi), %xmm0, %xmm0 +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: vptest %xmm0, %xmm0 +; AVX512-NEXT: sete %sil +; AVX512-NEXT: orq %rsi, (%rdx) +; AVX512-NEXT: vptest %xmm0, %xmm0 +; AVX512-NEXT: jne .LBB0_2 +; AVX512-NEXT: # %bb.1: # %if.then +; AVX512-NEXT: xorq %r9, %rax +; AVX512-NEXT: xorq %rdi, %r8 +; AVX512-NEXT: xorl %edx, %edx +; AVX512-NEXT: orq %rax, %r8 +; AVX512-NEXT: sete %dl +; AVX512-NEXT: orq %rdx, (%rcx) +; AVX512-NEXT: .LBB0_2: # %if.end +; AVX512-NEXT: retq +entry: + %a = load i128, ptr %pa, align 8 + %b = load i128, ptr %pb, align 8 + %cmp = icmp eq i128 %a, %b + %conv1 = zext i1 %cmp to i128 + %c = load i128, ptr %pc, align 8 + %or = or i128 %c, %conv1 + store i128 %or, ptr %pc, align 8 + br i1 %cmp, label %if.then, label %if.end + +if.then: + %d = load i128, ptr %pd, align 8 + %or7 = or i128 %d, %conv1 + store i128 %or7, ptr %pd, align 8 + br label %if.end + +if.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll index a852b731ea13b..9e523be618b44 100644 --- a/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/tail-folding-alloca-in-loop.ll @@ -12,12 +12,15 @@ define i32 @test(ptr %vf1, i64 %n) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE6:.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[PRED_STORE_CONTINUE6]] ] ; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i8> [[VEC_IND]], splat (i8 -56) +; CHECK-NEXT: [[TMP18:%.*]] = alloca i8, i64 [[N]], align 16 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP18]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x ptr> [[BROADCAST_SPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 ; CHECK-NEXT: br i1 [[TMP1]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; CHECK: [[PRED_STORE_IF]]: ; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds ptr, ptr [[VF1]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = alloca i8, i64 [[N]], align 16 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x ptr> [[BROADCAST_SPLAT]], i32 0 ; CHECK-NEXT: store ptr [[TMP4]], ptr [[TMP3]], align 8 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] ; CHECK: [[PRED_STORE_CONTINUE]]: @@ -26,7 +29,7 @@ define i32 @test(ptr %vf1, i64 %n) { ; CHECK: [[PRED_STORE_IF1]]: ; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr, ptr [[VF1]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = alloca i8, i64 [[N]], align 16 +; CHECK-NEXT: 
[[TMP8:%.*]] = extractelement <4 x ptr> [[BROADCAST_SPLAT]], i32 0 ; CHECK-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 8 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]] ; CHECK: [[PRED_STORE_CONTINUE2]]: @@ -35,7 +38,7 @@ define i32 @test(ptr %vf1, i64 %n) { ; CHECK: [[PRED_STORE_IF3]]: ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds ptr, ptr [[VF1]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = alloca i8, i64 [[N]], align 16 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x ptr> [[BROADCAST_SPLAT]], i32 0 ; CHECK-NEXT: store ptr [[TMP12]], ptr [[TMP11]], align 8 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]] ; CHECK: [[PRED_STORE_CONTINUE4]]: @@ -44,7 +47,7 @@ define i32 @test(ptr %vf1, i64 %n) { ; CHECK: [[PRED_STORE_IF5]]: ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds ptr, ptr [[VF1]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = alloca i8, i64 [[N]], align 16 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x ptr> [[BROADCAST_SPLAT]], i32 0 ; CHECK-NEXT: store ptr [[TMP16]], ptr [[TMP15]], align 8 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE6]] ; CHECK: [[PRED_STORE_CONTINUE6]]: diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn index 2ece91331c5d8..11a57fcb008cd 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/WebAssembly/BUILD.gn @@ -12,10 +12,17 @@ tablegen("WebAssemblyGenFastISel") { td_file = "WebAssembly.td" } +tablegen("WebAssemblyGenSDNodeInfo") { + visibility = [ ":LLVMWebAssemblyCodeGen" ] + args = [ "-gen-sd-node-info" ] + td_file = "WebAssembly.td" +} + static_library("LLVMWebAssemblyCodeGen") { deps = [ ":WebAssemblyGenDAGISel", ":WebAssemblyGenFastISel", + ":WebAssemblyGenSDNodeInfo", "MCTargetDesc", "TargetInfo", "//llvm/include/llvm/Config:llvm-config", diff --git a/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h b/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h index 964281592cc65..cad6cec761ab8 100644 --- a/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h +++ b/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h @@ -92,12 +92,43 @@ class VectorConvertToLLVMPattern : public ConvertOpToLLVMPattern<SourceOp> { using ConvertOpToLLVMPattern<SourceOp>::ConvertOpToLLVMPattern; using Super = VectorConvertToLLVMPattern<SourceOp, TargetOp, AttrConvert>; + /// Return the given type if it's a floating point type. If the given type is + /// a vector type, return its element type if it's a floating point type. + static FloatType getFloatingPointType(Type type) { + if (auto floatType = dyn_cast<FloatType>(type)) + return floatType; + if (auto vecType = dyn_cast<VectorType>(type)) + return dyn_cast<FloatType>(vecType.getElementType()); + return nullptr; + } + LogicalResult matchAndRewrite(SourceOp op, typename SourceOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { static_assert( std::is_base_of<OpTrait::OneResult<SourceOp>, SourceOp>::value, "expected single result op"); + + // The pattern should not apply if a floating-point operand is converted to + // a non-floating-point type. This indicates that the floating point type + // is not supported by the LLVM lowering. (Such types are converted to + // integers.)
+ auto checkType = [&](Value v) -> LogicalResult { + FloatType floatType = getFloatingPointType(v.getType()); + if (!floatType) + return success(); + Type convertedType = this->getTypeConverter()->convertType(floatType); + if (!isa_and_nonnull<FloatType>(convertedType)) + return rewriter.notifyMatchFailure(op, + "unsupported floating point type"); + return success(); + }; + for (Value operand : op->getOperands()) + if (failed(checkType(operand))) + return failure(); + if (failed(checkType(op->getResult(0)))) + return failure(); + // Determine attributes for the target op AttrConvert<SourceOp, TargetOp> attrConvert(op); diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index ed7e2a08ebfd9..5ac9e26e8636d 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -981,6 +981,28 @@ class ConversionPatternRewriter final : public PatternRewriter { /// Return a reference to the internal implementation. detail::ConversionPatternRewriterImpl &getImpl(); + /// Attempt to legalize the given operation. This can be used within + /// conversion patterns to change the default pre-order legalization order. + /// Returns "success" if the operation was legalized, "failure" otherwise. + /// + /// Note: In a partial conversion, this function returns "success" even if + /// the operation could not be legalized, as long as it was not explicitly + /// marked as illegal in the conversion target. + LogicalResult legalize(Operation *op); + + /// Attempt to legalize the given region. This can be used within + /// conversion patterns to change the default pre-order legalization order. + /// Returns "success" if the region was legalized, "failure" otherwise. + /// + /// If the current pattern runs with a type converter, the entry block + /// signature will be converted before legalizing the operations in the + /// region. + /// + /// Note: In a partial conversion, this function returns "success" even if + /// an operation could not be legalized, as long as it was not explicitly + /// marked as illegal in the conversion target. + LogicalResult legalize(Region *r); + private: // Allow OperationConverter to construct new rewriters. friend struct OperationConverter; @@ -989,7 +1011,8 @@ class ConversionPatternRewriter final : public PatternRewriter { /// conversions. They apply some IR rewrites in a delayed fashion and could /// bring the IR into an inconsistent state when used standalone. explicit ConversionPatternRewriter(MLIRContext *ctx, - const ConversionConfig &config); + const ConversionConfig &config, + OperationConverter &converter); // Hide unsupported pattern rewriter API. using OpBuilder::setListener; diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 2fe06970eb568..f8c38fadbd229 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -92,6 +92,22 @@ static OpBuilder::InsertPoint computeInsertPoint(ArrayRef<Value> vals) { return pt; } +namespace { +enum OpConversionMode { + /// In this mode, the conversion will ignore failed conversions to allow + /// illegal operations to co-exist in the IR. + Partial, + + /// In this mode, all operations must be legal for the given target for the + /// conversion to succeed. + Full, + + /// In this mode, operations are analyzed for legality. No actual rewrites are + /// applied to the operations on success.
+ Analysis, +}; +} // namespace + //===----------------------------------------------------------------------===// // ConversionValueMapping //===----------------------------------------------------------------------===// @@ -866,8 +882,9 @@ namespace mlir { namespace detail { struct ConversionPatternRewriterImpl : public RewriterBase::Listener { explicit ConversionPatternRewriterImpl(ConversionPatternRewriter &rewriter, - const ConversionConfig &config) - : rewriter(rewriter), config(config), + const ConversionConfig &config, + OperationConverter &opConverter) + : rewriter(rewriter), config(config), opConverter(opConverter), notifyingRewriter(rewriter.getContext(), config.listener) {} //===--------------------------------------------------------------------===// @@ -1124,6 +1141,9 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// Dialect conversion configuration. const ConversionConfig &config; + /// The operation converter to use for recursive legalization. + OperationConverter &opConverter; + /// A set of erased operations. This set is utilized only if /// `allowPatternRollback` is set to "false". Conceptually, this set is /// similar to `replacedOps` (which is maintained when the flag is set to @@ -2084,9 +2104,10 @@ void ConversionPatternRewriterImpl::notifyMatchFailure( //===----------------------------------------------------------------------===// ConversionPatternRewriter::ConversionPatternRewriter( - MLIRContext *ctx, const ConversionConfig &config) - : PatternRewriter(ctx), - impl(new detail::ConversionPatternRewriterImpl(*this, config)) { + MLIRContext *ctx, const ConversionConfig &config, + OperationConverter &opConverter) + : PatternRewriter(ctx), impl(new detail::ConversionPatternRewriterImpl( + *this, config, opConverter)) { setListener(impl.get()); } @@ -2207,6 +2228,37 @@ ConversionPatternRewriter::getRemappedValues(ValueRange keys, return success(); } +LogicalResult ConversionPatternRewriter::legalize(Region *r) { + // Fast path: If the region is empty, there is nothing to legalize. + if (r->empty()) + return success(); + + // Gather a list of all operations to legalize. This is done before + // converting the entry block signature because unrealized_conversion_cast + // ops should not be included. + SmallVector ops; + for (Block &b : *r) + for (Operation &op : b) + ops.push_back(&op); + + // If the current pattern runs with a type converter, convert the entry block + // signature. + if (const TypeConverter *converter = impl->currentTypeConverter) { + std::optional conversion = + converter->convertBlockSignature(&r->front()); + if (!conversion) + return failure(); + applySignatureConversion(&r->front(), *conversion, converter); + } + + // Legalize all operations in the region. + for (Operation *op : ops) + if (failed(legalize(op))) + return failure(); + + return success(); +} + void ConversionPatternRewriter::inlineBlockBefore(Block *source, Block *dest, Block::iterator before, ValueRange argValues) { @@ -3192,22 +3244,6 @@ static void reconcileUnrealizedCasts( // OperationConverter //===----------------------------------------------------------------------===// -namespace { -enum OpConversionMode { - /// In this mode, the conversion will ignore failed conversions to allow - /// illegal operations to co-exist in the IR. - Partial, - - /// In this mode, all operations must be legal for the given target for the - /// conversion to succeed. - Full, - - /// In this mode, operations are analyzed for legality. 
No actual rewrites are - /// applied to the operations on success. - Analysis, -}; -} // namespace - namespace mlir { // This class converts operations to a given conversion target via a set of // rewrite patterns. The conversion behaves differently depending on the @@ -3217,16 +3253,20 @@ struct OperationConverter { const FrozenRewritePatternSet &patterns, const ConversionConfig &config, OpConversionMode mode) - : rewriter(ctx, config), opLegalizer(rewriter, target, patterns), + : rewriter(ctx, config, *this), opLegalizer(rewriter, target, patterns), mode(mode) {} /// Converts the given operations to the conversion target. LogicalResult convertOperations(ArrayRef ops); -private: - /// Converts an operation with the given rewriter. - LogicalResult convert(Operation *op); + /// Converts a single operation. If `isRecursiveLegalization` is "true", the + /// conversion is a recursive legalization request, triggered from within a + /// pattern. In that case, do not emit errors because there will be another + /// attempt at legalizing the operation later (via the regular pre-order + /// legalization mechanism). + LogicalResult convert(Operation *op, bool isRecursiveLegalization = false); +private: /// The rewriter to use when converting operations. ConversionPatternRewriter rewriter; @@ -3238,32 +3278,42 @@ struct OperationConverter { }; } // namespace mlir -LogicalResult OperationConverter::convert(Operation *op) { +LogicalResult ConversionPatternRewriter::legalize(Operation *op) { + return impl->opConverter.convert(op, /*isRecursiveLegalization=*/true); +} + +LogicalResult OperationConverter::convert(Operation *op, + bool isRecursiveLegalization) { const ConversionConfig &config = rewriter.getConfig(); // Legalize the given operation. if (failed(opLegalizer.legalize(op))) { // Handle the case of a failed conversion for each of the different modes. // Full conversions expect all operations to be converted. - if (mode == OpConversionMode::Full) - return op->emitError() - << "failed to legalize operation '" << op->getName() << "'"; + if (mode == OpConversionMode::Full) { + if (!isRecursiveLegalization) + op->emitError() << "failed to legalize operation '" << op->getName() + << "'"; + return failure(); + } // Partial conversions allow conversions to fail iff the operation was not // explicitly marked as illegal. If the user provided a `unlegalizedOps` // set, non-legalizable ops are added to that set. if (mode == OpConversionMode::Partial) { - if (opLegalizer.isIllegal(op)) - return op->emitError() - << "failed to legalize operation '" << op->getName() - << "' that was explicitly marked illegal"; - if (config.unlegalizedOps) + if (opLegalizer.isIllegal(op)) { + if (!isRecursiveLegalization) + op->emitError() << "failed to legalize operation '" << op->getName() + << "' that was explicitly marked illegal"; + return failure(); + } + if (config.unlegalizedOps && !isRecursiveLegalization) config.unlegalizedOps->insert(op); } } else if (mode == OpConversionMode::Analysis) { // Analysis conversions don't fail if any operations fail to legalize, // they are only interested in the operations that were successfully // legalized. 
- if (config.legalizableOps) + if (config.legalizableOps && !isRecursiveLegalization) config.legalizableOps->insert(op); } return success(); diff --git a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir index ba12ff29ebef9..b5dcb01d3dc6b 100644 --- a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir +++ b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir @@ -747,3 +747,29 @@ func.func @memref_bitcast(%1: memref) -> memref { %2 = arith.bitcast %1 : memref to memref func.return %2 : memref } + +// ----- + +// CHECK-LABEL: func @unsupported_fp_type +// CHECK: arith.addf {{.*}} : f4E2M1FN +// CHECK: arith.addf {{.*}} : vector<4xf4E2M1FN> +// CHECK: arith.addf {{.*}} : vector<8x4xf4E2M1FN> +func.func @unsupported_fp_type(%arg0: f4E2M1FN, %arg1: vector<4xf4E2M1FN>, %arg2: vector<8x4xf4E2M1FN>) -> (f4E2M1FN, vector<4xf4E2M1FN>, vector<8x4xf4E2M1FN>) { + %0 = arith.addf %arg0, %arg0 : f4E2M1FN + %1 = arith.addf %arg1, %arg1 : vector<4xf4E2M1FN> + %2 = arith.addf %arg2, %arg2 : vector<8x4xf4E2M1FN> + return %0, %1, %2 : f4E2M1FN, vector<4xf4E2M1FN>, vector<8x4xf4E2M1FN> +} + +// ----- + +// CHECK-LABEL: func @supported_fp_type +// CHECK: llvm.fadd {{.*}} : f32 +// CHECK: llvm.fadd {{.*}} : vector<4xf32> +// CHECK-COUNT-4: llvm.fadd {{.*}} : vector<8xf32> +func.func @supported_fp_type(%arg0: f32, %arg1: vector<4xf32>, %arg2: vector<4x8xf32>) -> (f32, vector<4xf32>, vector<4x8xf32>) { + %0 = arith.addf %arg0, %arg0 : f32 + %1 = arith.addf %arg1, %arg1 : vector<4xf32> + %2 = arith.addf %arg2, %arg2 : vector<4x8xf32> + return %0, %1, %2 : f32, vector<4xf32>, vector<4x8xf32> +} diff --git a/mlir/test/Transforms/test-legalizer-full.mlir b/mlir/test/Transforms/test-legalizer-full.mlir index 42cec68b9fbbb..8da9109a32762 100644 --- a/mlir/test/Transforms/test-legalizer-full.mlir +++ b/mlir/test/Transforms/test-legalizer-full.mlir @@ -72,3 +72,21 @@ builtin.module { } } + +// ----- + +// The region of "test.post_order_legalization" is converted before the op. + +// expected-remark@+1 {{applyFullConversion failed}} +builtin.module { +func.func @test_preorder_legalization() { + // expected-error@+1 {{failed to legalize operation 'test.post_order_legalization'}} + "test.post_order_legalization"() ({ + ^bb0(%arg0: i64): + // Not-explicitly-legal ops are not allowed to survive. 
+ "test.remaining_consumer"(%arg0) : (i64) -> () + "test.invalid"(%arg0) : (i64) -> () + }) : () -> () + return +} +} diff --git a/mlir/test/Transforms/test-legalizer-rollback.mlir b/mlir/test/Transforms/test-legalizer-rollback.mlir index 71e11782e14b0..4bcca6b7e5228 100644 --- a/mlir/test/Transforms/test-legalizer-rollback.mlir +++ b/mlir/test/Transforms/test-legalizer-rollback.mlir @@ -163,3 +163,22 @@ func.func @create_unregistered_op_in_pattern() -> i32 { "test.return"(%0) : (i32) -> () } } + +// ----- + +// CHECK-LABEL: func @test_failed_preorder_legalization +// CHECK: "test.post_order_legalization"() ({ +// CHECK: %[[r:.*]] = "test.illegal_op_g"() : () -> i32 +// CHECK: "test.return"(%[[r]]) : (i32) -> () +// CHECK: }) : () -> () +// expected-remark @+1 {{applyPartialConversion failed}} +module { +func.func @test_failed_preorder_legalization() { + // expected-error @+1 {{failed to legalize operation 'test.post_order_legalization' that was explicitly marked illegal}} + "test.post_order_legalization"() ({ + %0 = "test.illegal_op_g"() : () -> (i32) + "test.return"(%0) : (i32) -> () + }) : () -> () + return +} +} diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir index 7c43bb7bface0..88a71cc26ab0c 100644 --- a/mlir/test/Transforms/test-legalizer.mlir +++ b/mlir/test/Transforms/test-legalizer.mlir @@ -448,3 +448,35 @@ func.func @test_working_1to1_pattern(%arg0: f16) { "test.type_consumer"(%arg0) : (f16) -> () "test.return"() : () -> () } + +// ----- + +// The region of "test.post_order_legalization" is converted before the op. + +// CHECK: notifyBlockInserted into test.post_order_legalization: was unlinked +// CHECK: notifyOperationInserted: test.invalid +// CHECK: notifyBlockErased +// CHECK: notifyOperationInserted: test.valid, was unlinked +// CHECK: notifyOperationReplaced: test.invalid +// CHECK: notifyOperationErased: test.invalid +// CHECK: notifyOperationModified: test.post_order_legalization + +// CHECK-LABEL: func @test_preorder_legalization +// CHECK: "test.post_order_legalization"() ({ +// CHECK: ^{{.*}}(%[[arg0:.*]]: f64): +// Note: The survival of a not-explicitly-invalid operation does *not* cause +// a conversion failure in when applying a partial conversion. 
+// CHECK: %[[cast:.*]] = "test.cast"(%[[arg0]]) : (f64) -> i64
+// CHECK: "test.remaining_consumer"(%[[cast]]) : (i64) -> ()
+// CHECK: "test.valid"(%[[arg0]]) : (f64) -> ()
+// CHECK: }) {is_legal} : () -> ()
+func.func @test_preorder_legalization() {
+  "test.post_order_legalization"() ({
+  ^bb0(%arg0: i64):
+    // expected-remark @+1 {{'test.remaining_consumer' is not legalizable}}
+    "test.remaining_consumer"(%arg0) : (i64) -> ()
+    "test.invalid"(%arg0) : (i64) -> ()
+  }) : () -> ()
+  // expected-remark @+1 {{'func.return' is not legalizable}}
+  return
+}
diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
index bba397e2e58c0..a38cfa848de4b 100644
--- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp
+++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp
@@ -1419,6 +1419,22 @@ class TestTypeConsumerOpPattern
   }
 };
+class TestPostOrderLegalization : public ConversionPattern {
+public:
+  TestPostOrderLegalization(MLIRContext *ctx, const TypeConverter &converter)
+      : ConversionPattern(converter, "test.post_order_legalization", 1, ctx) {}
+  LogicalResult
+  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
+                  ConversionPatternRewriter &rewriter) const final {
+    for (Region &r : op->getRegions())
+      if (failed(rewriter.legalize(&r)))
+        return failure();
+    rewriter.modifyOpInPlace(
+        op, [&]() { op->setAttr("is_legal", rewriter.getUnitAttr()); });
+    return success();
+  }
+};
+
 /// Test unambiguous overload resolution of replaceOpWithMultiple. This
 /// function is just to trigger compiler errors. It is never executed.
 [[maybe_unused]] void testReplaceOpWithMultipleOverloads(
@@ -1533,7 +1549,8 @@ struct TestLegalizePatternDriver
     patterns.add(&getContext(), converter);
+                 TestTypeConsumerOpPattern, TestPostOrderLegalization>(
+        &getContext(), converter);
     patterns.add(converter, &getContext());
     mlir::populateAnyFunctionOpInterfaceTypeConversionPattern(patterns, converter);
@@ -1561,6 +1578,9 @@ struct TestLegalizePatternDriver
     target.addDynamicallyLegalOp(
         OperationName("test.value_replace", &getContext()),
         [](Operation *op) { return op->hasAttr("is_legal"); });
+    target.addDynamicallyLegalOp(
+        OperationName("test.post_order_legalization", &getContext()),
+        [](Operation *op) { return op->hasAttr("is_legal"); });
     // TestCreateUnregisteredOp creates `arith.constant` operation,
     // which was not added to target intentionally to test
diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp
index 0888ebd7a9362..cb08397c201f2 100644
--- a/polly/lib/Transform/ScheduleOptimizer.cpp
+++ b/polly/lib/Transform/ScheduleOptimizer.cpp
@@ -927,9 +927,24 @@ static void runIslScheduleOptimizer(
     walkScheduleTreeForStatistics(Schedule, 2);
   }
+  // Check why the computation could have failed.
   if (MaxOpGuard.hasQuotaExceeded()) {
     POLLY_DEBUG(dbgs() << "Schedule optimizer calculation exceeds ISL quota\n");
     return;
+  } else if (isl_ctx_last_error(Ctx) != isl_error_none) {
+    const char *File = isl_ctx_last_error_file(Ctx);
+    int Line = isl_ctx_last_error_line(Ctx);
+    const char *Msg = isl_ctx_last_error_msg(Ctx);
+    POLLY_DEBUG(
+        dbgs()
+        << "ISL reported an error during the computation of a new schedule at "
+        << File << ":" << Line << ": " << Msg << "\n");
+    isl_ctx_reset_error(Ctx);
+    return;
+  } else if (Schedule.is_null()) {
+    POLLY_DEBUG(dbgs() << "Schedule optimizer did not compute a new schedule "
+                          "for unknown reasons\n");
+    return;
   }
   // Skip profitability check if user transformation(s) have been applied.
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index 5a1e0b53b021c..8d225d63cdf3e 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -2964,6 +2964,22 @@ libc_support_library( ], ) +libc_support_library( + name = "__support_math_exp2m1f16", + hdrs = ["src/__support/math/exp2m1f16.h"], + deps = [ + ":__support_fputil_except_value_utils", + ":__support_fputil_fma", + ":__support_fputil_multiply_add", + ":__support_fputil_nearest_integer", + ":__support_fputil_polyeval", + ":__support_fputil_rounding_mode", + ":__support_macros_optimization", + ":__support_math_common_constants", + ":__support_math_expxf16_utils", + ], +) + libc_support_library( name = "__support_math_exp10", hdrs = ["src/__support/math/exp10.h"], @@ -3762,7 +3778,7 @@ libc_math_function( libc_math_function( name = "exp2m1f16", additional_deps = [ - ":__support_math_expxf16_utils", + ":__support_math_exp2m1f16", ], )