WebAssembly · kripken · Jul 22, 2022 · May 18, 2022 · May 19, 2022 · May 19, 2022
diff --git a/scripts/fuzz_opt.py b/scripts/fuzz_opt.py
@@ -1158,6 +1158,8 @@ def write_commands(commands, filename):
     ["--global-refining"],
     ["--gsi"],
     ["--gto"],
+    ["--gufa"],
+    ["--gufa-optimizing"],
     ["--local-cse"],
     ["--heap2local"],
     ["--remove-unused-names", "--heap2local"],

diff --git a/src/ir/drop.h b/src/ir/drop.h
@@ -32,10 +32,10 @@ namespace wasm {
 //
 // The caller must also pass in a last item to append to the output (which is
 // typically what the original expression is replaced with).
-Expression* getDroppedChildrenAndAppend(Expression* curr,
-                                        Module& wasm,
-                                        const PassOptions& options,
-                                        Expression* last) {
+inline Expression* getDroppedChildrenAndAppend(Expression* curr,
+                                               Module& wasm,
+                                               const PassOptions& options,
+                                               Expression* last) {
   Builder builder(wasm);
   std::vector<Expression*> contents;
   for (auto* child : ChildIterator(curr)) {

diff --git a/src/passes/CMakeLists.txt b/src/passes/CMakeLists.txt
@@ -40,6 +40,7 @@ set(passes_SOURCES
   GlobalRefining.cpp
   GlobalStructInference.cpp
   GlobalTypeOptimization.cpp
+  GUFA.cpp
   Heap2Local.cpp
   I64ToI32Lowering.cpp
   Inlining.cpp

diff --git a/src/passes/GUFA.cpp b/src/passes/GUFA.cpp
@@ -0,0 +1,308 @@
+/*
+ * Copyright 2022 WebAssembly Community Group participants
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+//
+// Grand Unified Flow Analysis (GUFA)
+//
+// Optimize based on information about what content can appear in each location
+// in the program. This does a whole-program analysis to find that out and
+// hopefully learn more than the type system does - for example, a type might be
+// $A, which means $A or any subtype can appear there, but perhaps the analysis
+// can find that only $A', a particular subtype, can appear there in practice,
+// and not $A or any subtypes of $A', etc. Or, we may find that no type is
+// actually possible at a particular location, say if we can prove that the
+// casts on the way to that location allow nothing through. We can also find
+// that only a particular value is possible of that type.
+//
+// GUFA will infer constants and unreachability, and add those to the code. This
+// can increase code size if further optimizations are not run later like dead
+// code elimination and vacuum. The "optimizing" variant of this pass will run
+// such followup opts automatically in functions where we make changes, and so
+// it is useful if GUFA is run near the end of the optimization pipeline.
+//
+// TODO: GUFA + polymorphic devirtualization + traps-never-happen. If we see
+//       that the possible call targets are {A, B, C}, and GUFA info lets us
+//       prove that A, C will trap if called - say, if they cast the first
+//       parameter to something GUFA proved it cannot be - then we can ignore
+//       them, and devirtualize to a call to B.
+//
+
+#include "ir/drop.h"
+#include "ir/eh-utils.h"
+#include "ir/possible-contents.h"
+#include "ir/properties.h"
+#include "ir/utils.h"
+#include "pass.h"
+#include "wasm.h"
+
+namespace wasm {
+
+namespace {
+
+struct GUFAOptimizer
+  : public WalkerPass<
+      PostWalker<GUFAOptimizer, UnifiedExpressionVisitor<GUFAOptimizer>>> {
+  bool isFunctionParallel() override { return true; }
+
+  ContentOracle& oracle;
+  bool optimizing;
+
+  GUFAOptimizer(ContentOracle& oracle, bool optimizing)
+    : oracle(oracle), optimizing(optimizing) {}
+
+  GUFAOptimizer* create() override {
+    return new GUFAOptimizer(oracle, optimizing);
+  }
+
+  bool optimized = false;
+
+  // Check if removing something (but not its children - just the node itself)
+  // would be ok structurally - whether the IR would still validate.
+  bool canRemoveStructurally(Expression* curr) {
+    // We can remove almost anything, but not a branch target, as we still need
+    // the target for the branches to it to validate.
+    if (BranchUtils::getDefinedName(curr).is()) {
+      return false;
+    }
+
+    // Pops are structurally necessary in catch bodies, and removing a try could
+    // leave a pop without a proper parent.
+    return !curr->is<Pop>() && !curr->is<Try>();
+  }
+
+  // Whether we can remove something (but not its children) without changing
+  // observable behavior or breaking validation.
+  bool canRemove(Expression* curr) {
+    if (!canRemoveStructurally(curr)) {
+      return false;
+    }
+    return !EffectAnalyzer(getPassOptions(), *getModule(), curr)
+              .hasUnremovableSideEffects();
+  }
+
+  // Whether we can replace something (but not its children, we can keep them
+  // with drops) with an unreachable without changing observable behavior or
+  // breaking validation.
+  bool canReplaceWithUnreachable(Expression* curr) {
+    if (!canRemoveStructurally(curr)) {
+      return false;
+    }
+    EffectAnalyzer effects(getPassOptions(), *getModule(), curr);
+    // Ignore a trap, as the unreachable replacement would trap too.
+    effects.trap = false;
+    return !effects.hasUnremovableSideEffects();
+  }
+
+  void visitExpression(Expression* curr) {
+    // Skip things we can't improve in any way.
+    auto type = curr->type;
+    if (type == Type::unreachable || type == Type::none ||
+        Properties::isConstantExpression(curr)) {
+      return;
+    }
+
+    if (type.isTuple()) {
+      // TODO: tuple types.
+      return;
+    }
+
+    if (type.isRef() && (getTypeSystem() != TypeSystem::Nominal &&
+                         getTypeSystem() != TypeSystem::Isorecursive)) {
+      // Without type info we can't analyze subtypes, so we cannot infer
+      // anything about refs.
+      return;
+    }
+
+    // Ok, this is an interesting location that we might optimize. See what the
+    // oracle says is possible there.
+    auto contents = oracle.getContents(ExpressionLocation{curr, 0});
+
+    auto& options = getPassOptions();
+    auto& wasm = *getModule();
+    Builder builder(wasm);
+
+    auto replaceWithUnreachable = [&]() {
+      if (canReplaceWithUnreachable(curr)) {
+        replaceCurrent(getDroppedChildrenAndAppend(
+          curr, wasm, options, builder.makeUnreachable()));
+      } else {
+        // We can't remove this, but we can at least put an unreachable
+        // right after it.
+        replaceCurrent(builder.makeSequence(builder.makeDrop(curr),
+                                            builder.makeUnreachable()));
+      }
+      optimized = true;
+    };
+
+    if (contents.getType() == Type::unreachable) {
+      // This cannot contain any possible value at all. It must be unreachable
+      // code.
+      replaceWithUnreachable();
+      return;
+    }
+
+    // This is reachable. Check if we can emit something optimized for it.
+    // TODO: can we handle more general things here too?
+    if (!contents.canMakeExpression()) {
+      return;
+    }
+
+    if (contents.isNull() && curr->type.isNullable()) {
+      // Null values are all identical, so just fix up the type here (the null's
+      // type might not fit in this expression).
+      contents =
+        PossibleContents::literal(Literal::makeNull(curr->type.getHeapType()));
+
+      // Note that if curr's type is *not* nullable, then the code will trap at
+      // runtime (the null must arrive through a cast that will trap). We handle
+      // that below, so we don't need to think about it here.
+
+      // TODO: would emitting a more specific null be useful when valid?
+    }
+
+    auto* c = contents.makeExpression(wasm);
+
+    // We can only place the constant value here if it has the right type. For
+    // example, a block may return (ref any), that is, not allow a null, but in
+    // practice only a null may flow there if it goes through casts that will
+    // trap at runtime.
+    // TODO: GUFA should eventually do this, but it will require it properly
+    //       filtering content not just on ref.cast as it does now, but also
+    //       ref.as etc. Once it does those we could assert on the type being
+    //       valid here.
+    if (Type::isSubType(c->type, curr->type)) {
+      if (canRemove(curr)) {
+        replaceCurrent(getDroppedChildrenAndAppend(curr, wasm, options, c));
+      } else {
+        // We can't remove this, but we can at least drop it and put the
+        // optimized value right after it.
+        replaceCurrent(builder.makeSequence(builder.makeDrop(curr), c));
+      }
+      optimized = true;
+    } else {
+      // The type is not compatible: we cannot place |c| in this location, even
+      // though we have proven it is the only value possible here.
+      if (Properties::isConstantExpression(c)) {
+        // The type is not compatible and this is a simple constant expression
+        // like a ref.func. That means this code must be unreachable. (See below
+        // for the case of a non-constant.)
+        replaceWithUnreachable();
+      } else {
+        // This is not a constant expression, but we are certain it is the right
+        // value. Atm the only such case we handle is a global.get of an
+        // immutable global. We don't know what the value will be, nor its
+        // specific type, but we do know that a global.get will get that value
+        // properly. However, in this case it does not have the right type for
+        // this location. That can happen since the global.get does not have
+        // exactly the proper type for the contents: the global.get might be
+        // nullable, for example, even though the contents are not actually a
+        // null. For example, consider what happens here:
+        //
+        //  (global $foo (ref null any) (struct.new $Foo))
+        //  ..
+        //    (ref.as_non_null
+        //      (global.get $foo))
+        //
+        // We create a $Foo in the global $foo, so its value is not a null. But
+        // the global's type is nullable, so the global.get's type will be as
+        // well. When we get to the ref.as_non_null, we then want to replace it
+        // with a global.get - in fact that's what its child already is, showing
+        // it is the right content for it - but that global.get would not have a
+        // non-nullable type like a ref.as_non_null must have, so we cannot
+        // simply replace it.
+        //
+        // For now, do nothing here, but in some cases we could probably
+        // optimize (e.g. by adding a ref.as_non_null in the example) TODO
+        assert(c->is<GlobalGet>());
+      }
+    }
+  }
+
+  // TODO: If an instruction would trap on null, like struct.get, we could
+  //       remove it here if it has no possible contents. That information
+  //       is present in OptimizeInstructions where it removes redundant
+  //       ref.as_non_null, so maybe there is a way to share that.
+
+  void visitFunction(Function* func) {
+    if (!optimized) {
+      return;
+    }
+
+    // Optimization may introduce more unreachables, which we need to
+    // propagate.
+    ReFinalize().walkFunctionInModule(func, getModule());
+
+    // We may add blocks around pops, which we must fix up.
+    EHUtils::handleBlockNestedPops(func, *getModule());
+
+    // If we are in "optimizing" mode, we'll also run some more passes on this
+    // function that we just optimized. If not, leave now.
+    if (!optimizing) {
+      return;
+    }
+
+    PassRunner runner(getModule(), getPassOptions());
+    runner.setIsNested(true);
+    // New unreachables we added have created dead code we can remove. If we do
+    // not do this, then running GUFA repeatedly can actually increase code size
+    // (by adding multiple unneeded unreachables).
+    runner.add("dce");
+    // New drops we added allow us to remove more unused code and values. As
+    // with unreachables, without a vacuum we may increase code size as in
+    // nested expressions we may apply the same value multiple times:
+    //
+    //  (block $out
+    //   (block $in
+    //    (i32.const 10)))
+    //
+    // In each of the blocks we'll infer the value must be 10, so we'll end up
+    // with this repeating code:
+    //
+    //  (block ;; a new block just to drop the old outer block
+    //   (drop
+    //    (block $out
+    //     (drop
+    //      (block $in
+    //       (i32.const 10)
+    //      )
+    //     )
+    //     (i32.const 10)
+    //    )
+    //   )
+    //   (i32.const 10)
+    //  )
+    runner.add("vacuum");
+    runner.runOnFunction(func);
+  }
+};
+
+struct GUFAPass : public Pass {
+  bool optimizing;
+
+  GUFAPass(bool optimizing) : optimizing(optimizing) {}
+
+  void run(PassRunner* runner, Module* module) override {
+    ContentOracle oracle(*module);
+    GUFAOptimizer(oracle, optimizing).run(runner, module);
+  }
+};
+
+} // anonymous namespace
+
+Pass* createGUFAPass() { return new GUFAPass(false); }
+Pass* createGUFAOptimizingPass() { return new GUFAPass(true); }
+
+} // namespace wasm
diff --git a/src/passes/pass.cpp b/src/passes/pass.cpp
@@ -168,10 +168,18 @@ void PassRegistry::registerPasses() {
     "generate-stack-ir", "generate Stack IR", createGenerateStackIRPass);
   registerPass(
     "global-refining", "refine the types of globals", createGlobalRefiningPass);
-  registerPass(
-    "gto", "globally optimize GC types", createGlobalTypeOptimizationPass);
   registerPass(
     "gsi", "globally optimize struct values", createGlobalStructInferencePass);
+  registerPass(
+    "gto", "globally optimize GC types", createGlobalTypeOptimizationPass);
+  registerPass("gufa",
+               "Grand Unified Flow Analysis: optimize the entire program using "
+               "information about what content can actually appear in each "
+               "location",
+               createGUFAPass);
+  registerPass("gufa-optimizing",
+               "GUFA plus local optimizations in functions we modified",
+               createGUFAOptimizingPass);
   registerPass("type-refining",
                "apply more specific subtypes to type fields where possible",
                createTypeRefiningPass);

diff --git a/src/passes/passes.h b/src/passes/passes.h
@@ -54,6 +54,8 @@ Pass* createGenerateStackIRPass();
 Pass* createGlobalRefiningPass();
 Pass* createGlobalStructInferencePass();
 Pass* createGlobalTypeOptimizationPass();
+Pass* createGUFAPass();
+Pass* createGUFAOptimizingPass();
 Pass* createHeap2LocalPass();
 Pass* createI64ToI32LoweringPass();
 Pass* createInlineMainPass();

diff --git a/test/lit/help/wasm-opt.test b/test/lit/help/wasm-opt.test
@@ -173,6 +173,15 @@
 ;; CHECK-NEXT:
 ;; CHECK-NEXT:   --gto                                         globally optimize GC types
 ;; CHECK-NEXT:
+;; CHECK-NEXT:   --gufa                                        Grand Unified Flow Analysis:
+;; CHECK-NEXT:                                                 optimize the entire program
+;; CHECK-NEXT:                                                 using information about what
+;; CHECK-NEXT:                                                 content can actually appear in
+;; CHECK-NEXT:                                                 each location
+;; CHECK-NEXT:
+;; CHECK-NEXT:   --gufa-optimizing                             GUFA plus local optimizations in
+;; CHECK-NEXT:                                                 functions we modified
+;; CHECK-NEXT:
 ;; CHECK-NEXT:   --heap2local                                  replace GC allocations with
 ;; CHECK-NEXT:                                                 locals
 ;; CHECK-NEXT: