diff --git a/src/passes/Precompute.cpp b/src/passes/Precompute.cpp
index c1a9a1bb43a..f3a6e52e8c2 100644
--- a/src/passes/Precompute.cpp
+++ b/src/passes/Precompute.cpp
@@ -72,20 +72,19 @@ using GetValues = std::unordered_map<LocalGet*, Literals>;
 // possible input values than that struct.new, which means we will not infer
 // a value for it, and not attempt to say anything about comparisons of $x.
 struct HeapValues {
-  // Store two maps, one for effects and one without. The one with effects is
-  // used when PRESERVE_SIDEEFFECTS is on, and the other when not. This is
-  // necessary because when we preserve effects then nested effects in a GC
-  // allocation can cause us to end up as nonconstant (nothing can be
-  // precomputed), and we do not want to mix results between the two modes (if
-  // we did, we might cache a result when we ignore effects that we later use
-  // when not ignoring them, which would forget the effects).
-  std::unordered_map<Expression*, std::shared_ptr<GCData>> withEffects,
-    withoutEffects;
-
-  void clear() {
-    withEffects.clear();
-    withoutEffects.clear();
-  }
+  struct Entry {
+    // The GC data for an expression.
+    std::shared_ptr<GCData> data;
+    // Whether the expression has effects. If it does then we must recompute it
+    // each time we see it, even though we return |data| to represent it.
+    // (Recomputing will apply those effects each time, so we don't forget them
+    // when we read from the cache. This recomputing is rare, and doesn't happen
+    // e.g. in global GC objects, where most of the work happens, so this cache
+    // still saves a lot.)
+    bool hasEffects;
+  };
+
+  std::unordered_map<Expression*, Entry> map;
 };
 
 // Precomputes an expression. Errors if we hit anything that can't be
@@ -202,23 +201,28 @@ class PrecomputingExpressionRunner
 
   // Generates heap info for a heap-allocating expression.
   Flow getGCAllocation(Expression* curr, std::function<Flow()> visitFunc) {
-    auto& heapValuesMap = (flags & FlagValues::PRESERVE_SIDEEFFECTS)
-                            ? heapValues.withEffects
-                            : heapValues.withoutEffects;
     // We must return a literal that refers to the canonical location for this
     // source expression, so that each time we compute a specific *.new then
     // we get the same identity.
-    auto iter = heapValuesMap.find(curr);
-    if (iter != heapValuesMap.end()) {
+    auto iter = heapValues.map.find(curr);
+    if (iter != heapValues.map.end()) {
+      auto& [data, hasEffects] = iter->second;
+      if (hasEffects) {
+        // Visit, so we recompute the effects. (This is rare, see comment
+        // above.)
+        visitFunc();
+      }
       // Refer to the same canonical GCData that we already created.
-      return Literal(iter->second, curr->type.getHeapType());
+      return Literal(data, curr->type.getHeapType());
     }
-    // Only call the visitor function here, so we do it once per allocation.
+    // Not cached yet: call the visitor function and cache its result below.
+    // See if we have effects while doing so.
     auto flow = visitFunc();
     if (flow.breaking()) {
       return flow;
     }
-    heapValuesMap[curr] = flow.getSingleValue().getGCData();
+    heapValues.map[curr] =
+      HeapValues::Entry{flow.getSingleValue().getGCData(), hasEffectfulSets()};
     return flow;
   }
 
@@ -301,94 +305,169 @@ struct Precompute
     // unlikely chance, we leave such things for later.
   }
 
-  template<typename T> void reuseConstantNode(T* curr, Flow flow) {
-    if (flow.values.isConcrete()) {
-      // reuse a const / ref.null / ref.func node if there is one
-      if (curr->value && flow.values.size() == 1) {
-        Literal singleValue = flow.getSingleValue();
-        if (singleValue.type.isNumber()) {
-          if (auto* c = curr->value->template dynCast<Const>()) {
-            c->value = singleValue;
-            c->finalize();
-            curr->finalize();
-            return;
-          }
-        } else if (singleValue.isNull()) {
-          if (auto* n = curr->value->template dynCast<RefNull>()) {
-            n->finalize(singleValue.type);
-            curr->finalize();
-            return;
-          }
-        } else if (singleValue.type.isRef() &&
-                   singleValue.type.getHeapType().isSignature()) {
-          if (auto* r = curr->value->template dynCast<RefFunc>()) {
-            r->func = singleValue.getFunc();
-            auto heapType = getModule()->getFunction(r->func)->type;
-            r->finalize(heapType);
-            curr->finalize();
-            return;
-          }
-        }
+  void visitExpression(Expression* curr) {
+    // Ignore trivial things like constants, nops, local/global.set (which have
+    // an effect we cannot remove, and it is simpler to ignore them here than
+    // later below), return (which we cannot improve), and loop (which it is
+    // simpler to leave for other passes).
+    if (Properties::isConstantExpression(curr) || curr->is<Nop>() ||
+        curr->is<LocalSet>() || curr->is<GlobalSet>() || curr->is<Return>() ||
+        curr->is<Loop>()) {
+      return;
+    }
+    // Breaks with conditions can be simplified, but unconditional ones are like
+    // returns, and we cannot improve.
+    if (auto* br = curr->dynCast<Break>()) {
+      if (!br->condition) {
+        return;
       }
-      curr->value = flow.getConstExpression(*getModule());
-    } else {
-      curr->value = nullptr;
     }
-    curr->finalize();
-  }
 
-  void visitExpression(Expression* curr) {
-    // TODO: if local.get, only replace with a constant if we don't care about
-    // size...?
-    if (Properties::isConstantExpression(curr) || curr->is<Nop>()) {
+    // See if we can precompute the value that flows out. We set
+    // |replaceExpression| to false because we do not necessarily want to
+    // replace it entirely, see below - we may keep parts, in some cases, if we
+    // can still simplify it to a precomputed value.
+    Flow flow;
+    PrecomputingExpressionRunner runner(
+      getModule(), getValues, heapValues, false /* replaceExpression */);
+    try {
+      flow = runner.visit(curr);
+    } catch (NonconstantException&) {
       return;
     }
-    // try to evaluate this into a const
-    Flow flow = precomputeExpression(curr);
+    // The resulting value must be of a type we can emit a constant for (or
+    // there must be no value at all, in which case the value is a nop).
     if (!canEmitConstantFor(flow.values)) {
       return;
     }
+    if (flow.breakTo == NONCONSTANT_FLOW) {
+      // This cannot be turned into a constant, but perhaps we can partially
+      // precompute it.
+      considerPartiallyPrecomputing(curr);
+      return;
+    }
+    // TODO: Handle suspends somehow?
+    if (flow.suspendTag) {
+      return;
+    }
+
+    // This looks like a promising precomputation: We have found that its value,
+    // if any, can be emitted as a constant (or there is no value, and it is a
+    // nop or break etc.). Build that value, so we can replace the expression
+    // with it.
+    Builder builder(*getModule());
+    Expression* value = nullptr;
+    if (flow.values.isConcrete()) {
+      value = flow.getConstExpression(*getModule());
+    }
     if (flow.breaking()) {
-      if (flow.breakTo == NONCONSTANT_FLOW) {
-        // This cannot be turned into a constant, but perhaps we can partially
-        // precompute it.
-        considerPartiallyPrecomputing(curr);
+      if (flow.breakTo == RETURN_FLOW) {
+        // We avoided trivial returns earlier (by doing so, we avoid wasted
+        // work replacing a return with itself).
+        assert(!curr->is<Return>());
+        value = builder.makeReturn(value);
+      } else {
+        value = builder.makeBreak(flow.breakTo, value);
+      }
+      // Note we don't need to handle RETURN_CALL_FLOW, as the call there has
+      // effects that would stop us earlier.
+    }
+
+    // We have something to replace the expression. While precomputing the
+    // expression, we verified it has no effects that cause problems - no traps
+    // or exceptions etc., as those things would lead to NONCONSTANT_FLOW. We
+    // can therefore replace this with what flows out of it. The only exception
+    // is that we set replaceExpression to false, above, which means we run the
+    // interpreter without PRESERVE_SIDEEFFECTS. That allows local and global
+    // sets to happen (to help optimize small code fragments with sets and
+    // gets). To handle that, keep relevant children if we have such sets.
+    if (runner.hasEffectfulSets()) {
+      if (curr->is<Block>() || curr->is<If>() || curr->is<Try>()) {
+        // These control flow structures have children that might not execute.
+        // We know that some of the children have effectful sets, but not which,
+        // and we can't just keep them all, so give up.
+        // TODO: Check if this would be useful to improve, but other passes
+        //       might do enough already.
         return;
       }
-      if (flow.breakTo == RETURN_FLOW) {
-        // this expression causes a return. if it's already a return, reuse the
-        // node
-        if (auto* ret = curr->dynCast<Return>()) {
-          reuseConstantNode(ret, flow);
-        } else {
-          Builder builder(*getModule());
-          replaceCurrent(builder.makeReturn(
-            flow.values.isConcrete() ? flow.getConstExpression(*getModule())
-                                     : nullptr));
-        }
+
+      // To keep things simple, stop here if we are precomputing to a break/
+      // return. Handling that case requires ordering considerations:
+      //
+      //  (foo
+      //    (br)
+      //    (call)
+      //  )
+      //
+      // Here we know we need to keep the call, and can remove foo, but this
+      // would be wrong:
+      //
+      //  (block
+      //    ;; removed br
+      //    (call)
+      //    (br) ;; the value we precompute to, added at the end
+      //  )
+      //
+      // Instead we must keep the br, leaving this for later opts to improve:
+      //
+      //  (block
+      //    (br)
+      //    (call)
+      //    (br)
+      //  )
+      //
+      // That is, we cannot remove unneeded children easily in this case, where
+      // control flow might transfer, so we need to keep all children when we
+      // remove foo. In that case, it's not clear we are helping much, and other
+      // passes can do better with the break/return anyhow. After dismissing
+      // this situation, we know no transfer of control flow needs to be handled
+      // in the code below (because we executed the code, and found it did not
+      // do so).
+      if (flow.breaking()) {
         return;
       }
-      // this expression causes a break, emit it directly. if it's already a br,
-      // reuse the node.
-      if (auto* br = curr->dynCast<Break>()) {
-        br->name = flow.breakTo;
-        br->condition = nullptr;
-        reuseConstantNode(br, flow);
-      } else {
-        Builder builder(*getModule());
-        replaceCurrent(builder.makeBreak(
-          flow.breakTo,
-          flow.values.isConcrete() ? flow.getConstExpression(*getModule())
-                                   : nullptr));
+
+      // Find the necessary children that we must keep.
+      SmallVector<Expression*, 10> kept;
+      for (auto* child : ChildIterator(curr)) {
+        EffectAnalyzer effects(getPassOptions(), *getModule(), child);
+        if (!effects.localsWritten.empty() || !effects.globalsWritten.empty()) {
+          kept.push_back(builder.makeDrop(child));
+        }
+      }
+      // Find all the things we must keep, which might include |value|.
+      if (!kept.empty()) {
+        if (value) {
+          kept.push_back(value);
+        }
+        if (kept.size() == 1) {
+          value = kept[0];
+        } else {
+          // We are returning a block with some kept children + some value. This
+          // may seem to increase code size in some cases, but it cannot do so
+          // monotonically: while doing all this we are definitely removing
+          // |curr| itself, so we are making progress, even if we emit a new
+          // constant that we weren't before. That is, we are not in this
+          // situation:
+          //
+          //   (foo A B) => (block (foo A B) (value))
+          //
+          // We are in this one:
+          //
+          //   (foo A B) => (block A B (value))
+          //
+          // where foo vanishes.
+          value = builder.makeBlock(kept);
+        }
       }
-      return;
     }
-    // this was precomputed
-    if (flow.values.isConcrete()) {
-      replaceCurrent(flow.getConstExpression(*getModule()));
-    } else {
+    if (!value) {
+      // We don't need to replace this with anything: there is no value or other
+      // code that we need. Just nop it.
       ExpressionManipulator::nop(curr);
+      return;
     }
+    replaceCurrent(value);
   }
 
   void visitBlock(Block* curr) {
@@ -695,7 +774,7 @@ struct Precompute
         // |parent|. Results here must not be cached for later.
         HeapValues temp;
         auto ifTrue = precomputeExpression(parent, true, &temp);
-        temp.clear();
+        temp.map.clear();
         if (isValidPrecomputation(ifTrue)) {
           *pointerToSelect = select->ifFalse;
           auto ifFalse = precomputeExpression(parent, true, &temp);
diff --git a/src/wasm-interpreter.h b/src/wasm-interpreter.h
index c0191e50cce..d6a9db9bf9c 100644
--- a/src/wasm-interpreter.h
+++ b/src/wasm-interpreter.h
@@ -2741,6 +2741,11 @@ class ConstantExpressionRunner : public ExpressionRunner<SubType> {
     globalValues[name] = values;
   }
 
+  // Returns true if we set a local or a global.
+  bool hasEffectfulSets() const {
+    return !localValues.empty() || !globalValues.empty();
+  }
+
   Flow visitLocalGet(LocalGet* curr) {
     // Check if a constant value has been set in the context of this runner.
     auto iter = localValues.find(curr->index);
diff --git a/test/lit/ctor-eval/return_call.wast b/test/lit/ctor-eval/return_call.wast
index 650c5c63ce4..20dace4dbc9 100644
--- a/test/lit/ctor-eval/return_call.wast
+++ b/test/lit/ctor-eval/return_call.wast
@@ -437,44 +437,25 @@
 ;; CHECK-NEXT: )
 (module
   ;; Return call to self with different params, then stop evaluating.
-  ;; CHECK:      (type $0 (func (param i32)))
+  ;; CHECK:      (type $0 (func))
 
-  ;; CHECK:      (type $1 (func))
+  ;; CHECK:      (type $1 (func (param i32)))
 
-  ;; CHECK:      (import "env" "import" (func $import (type $1)))
+  ;; CHECK:      (import "env" "import" (func $import (type $0)))
   (import "env" "import" (func $import))
 
-  ;; CHECK:      (global $g (mut i32) (i32.const 42))
   (global $g (mut i32) (i32.const 0))
 
-  ;; CHECK:      (export "test" (func $test_2))
-
-  ;; CHECK:      (func $test (type $0) (param $0 i32)
-  ;; CHECK-NEXT:  (global.set $g
-  ;; CHECK-NEXT:   (local.get $0)
-  ;; CHECK-NEXT:  )
-  ;; CHECK-NEXT:  (if
-  ;; CHECK-NEXT:   (i32.eq
-  ;; CHECK-NEXT:    (local.get $0)
-  ;; CHECK-NEXT:    (i32.const 42)
-  ;; CHECK-NEXT:   )
-  ;; CHECK-NEXT:   (then
-  ;; CHECK-NEXT:    (call $import)
-  ;; CHECK-NEXT:   )
-  ;; CHECK-NEXT:   (else
-  ;; CHECK-NEXT:    (return_call $test
-  ;; CHECK-NEXT:     (i32.add
-  ;; CHECK-NEXT:      (local.get $0)
-  ;; CHECK-NEXT:      (i32.const 1)
-  ;; CHECK-NEXT:     )
-  ;; CHECK-NEXT:    )
-  ;; CHECK-NEXT:   )
-  ;; CHECK-NEXT:  )
-  ;; CHECK-NEXT: )
   (func $test (export "test") (param i32)
     (global.set $g
       (local.get 0)
     )
+    ;; When first called, the param is 0 and we do the return call, sending it
+    ;; a parameter of 1. That does another return call, and so forth, until the
+    ;; incrementing counter in the param reaches 42 and the if condition is
+    ;; true. We then call the import. The final precomputed code after all this
+    ;; is just to set the local to 42 (which other optimizations can remove),
+    ;; and to call the import.
     (if
       (i32.eq
         (local.get 0)
@@ -495,24 +476,11 @@
   )
 )
 
-;; CHECK:      (func $test_2 (type $0) (param $0 i32)
-;; CHECK-NEXT:  (if
-;; CHECK-NEXT:   (i32.eq
-;; CHECK-NEXT:    (local.tee $0
-;; CHECK-NEXT:     (i32.const 42)
-;; CHECK-NEXT:    )
-;; CHECK-NEXT:    (i32.const 42)
-;; CHECK-NEXT:   )
-;; CHECK-NEXT:   (then
-;; CHECK-NEXT:    (call $import)
-;; CHECK-NEXT:   )
-;; CHECK-NEXT:   (else
-;; CHECK-NEXT:    (return_call $test
-;; CHECK-NEXT:     (i32.add
-;; CHECK-NEXT:      (local.get $0)
-;; CHECK-NEXT:      (i32.const 1)
-;; CHECK-NEXT:     )
-;; CHECK-NEXT:    )
-;; CHECK-NEXT:   )
+;; CHECK:      (export "test" (func $test_2))
+
+;; CHECK:      (func $test_2 (type $1) (param $0 i32)
+;; CHECK-NEXT:  (local.set $0
+;; CHECK-NEXT:   (i32.const 42)
 ;; CHECK-NEXT:  )
+;; CHECK-NEXT:  (call $import)
 ;; CHECK-NEXT: )
diff --git a/test/lit/passes/precompute-effects.wast b/test/lit/passes/precompute-effects.wast
new file mode 100644
index 00000000000..d2709cf3a4d
--- /dev/null
+++ b/test/lit/passes/precompute-effects.wast
@@ -0,0 +1,294 @@
+;; NOTE: Assertions have been generated by update_lit_checks.py and should not be edited.
+
+;; RUN: wasm-opt %s --remove-unused-names --precompute-propagate --fuzz-exec -all -S -o - \
+;; RUN:   | filecheck %s
+
+(module
+ ;; CHECK:      (global $g (mut i32) (i32.const 10))
+ (global $g (mut i32) (i32.const 10))
+
+ ;; CHECK:      (func $loop (type $1)
+ ;; CHECK-NEXT:  (local $temp i32)
+ ;; CHECK-NEXT:  (local.set $temp
+ ;; CHECK-NEXT:   (i32.const 10)
+ ;; CHECK-NEXT:  )
+ ;; CHECK-NEXT: )
+ (func $loop
+  (local $temp i32)
+  ;; We should not try to precompute this loop. If we attempted to replace it
+  ;; with its children, we'd need to handle the effects of children properly,
+  ;; which we do not do in this pass.
+  (loop
+   (local.set $temp
+    (i32.const 10)
+   )
+  )
+ )
+
+ ;; CHECK:      (func $local.set (type $1)
+ ;; CHECK-NEXT:  (local $temp i32)
+ ;; CHECK-NEXT:  (local.set $temp
+ ;; CHECK-NEXT:   (i32.const 10)
+ ;; CHECK-NEXT:  )
+ ;; CHECK-NEXT:  (drop
+ ;; CHECK-NEXT:   (local.tee $temp
+ ;; CHECK-NEXT:    (i32.const 20)
+ ;; CHECK-NEXT:   )
+ ;; CHECK-NEXT:  )
+ ;; CHECK-NEXT: )
+ (func $local.set
+  (local $temp i32)
+  ;; We should not try to precompute a set or tee.
+  (local.set $temp
+   (i32.const 10)
+  )
+  (drop
+   (local.tee $temp
+    (i32.const 20)
+   )
+  )
+ )
+
+ ;; CHECK:      (func $global.set (type $1)
+ ;; CHECK-NEXT:  (global.set $g
+ ;; CHECK-NEXT:   (i32.const 20)
+ ;; CHECK-NEXT:  )
+ ;; CHECK-NEXT: )
+ (func $global.set
+  ;; We should not try to precompute a global set.
+  (global.set $g
+   (i32.const 20)
+  )
+ )
+
+ ;; CHECK:      (func $binary-tee (type $0) (result i32)
+ ;; CHECK-NEXT:  (local $temp i32)
+ ;; CHECK-NEXT:  (drop
+ ;; CHECK-NEXT:   (local.tee $temp
+ ;; CHECK-NEXT:    (i32.const 10)
+ ;; CHECK-NEXT:   )
+ ;; CHECK-NEXT:  )
+ ;; CHECK-NEXT:  (i32.const 20)
+ ;; CHECK-NEXT: )
+ (func $binary-tee (result i32)
+  (local $temp i32)
+  ;; We can precompute this and remove the add, but must keep the tee.
+  (i32.add
+   (local.tee $temp
+    (i32.const 10)
+   )
+   (local.get $temp)
+  )
+ )
+
+ ;; CHECK:      (func $binary-tee-2 (type $0) (result i32)
+ ;; CHECK-NEXT:  (local $temp i32)
+ ;; CHECK-NEXT:  (drop
+ ;; CHECK-NEXT:   (local.tee $temp
+ ;; CHECK-NEXT:    (i32.const 10)
+ ;; CHECK-NEXT:   )
+ ;; CHECK-NEXT:  )
+ ;; CHECK-NEXT:  (i32.const 10)
+ ;; CHECK-NEXT: )
+ (func $binary-tee-2 (result i32)
+  (local $temp i32)
+  ;; A tee on the other side.
+  (i32.add
+   (local.get $temp)
+   (local.tee $temp
+    (i32.const 10)
+   )
+  )
+ )
+
+ ;; CHECK:      (func $binary-both (type $0) (result i32)
+ ;; CHECK-NEXT:  (local $temp i32)
+ ;; CHECK-NEXT:  (drop
+ ;; CHECK-NEXT:   (local.tee $temp
+ ;; CHECK-NEXT:    (i32.const 10)
+ ;; CHECK-NEXT:   )
+ ;; CHECK-NEXT:  )
+ ;; CHECK-NEXT:  (drop
+ ;; CHECK-NEXT:   (local.tee $temp
+ ;; CHECK-NEXT:    (i32.const 20)
+ ;; CHECK-NEXT:   )
+ ;; CHECK-NEXT:  )
+ ;; CHECK-NEXT:  (i32.const 30)
+ ;; CHECK-NEXT: )
+ (func $binary-both (result i32)
+  (local $temp i32)
+  ;; Now we must keep both tees.
+  (i32.add
+   (local.tee $temp
+    (i32.const 10)
+   )
+   (local.tee $temp
+    (i32.const 20)
+   )
+  )
+ )
+
+ ;; CHECK:      (func $nested-global (type $0) (result i32)
+ ;; CHECK-NEXT:  (local $temp i32)
+ ;; CHECK-NEXT:  (drop
+ ;; CHECK-NEXT:   (block (result i32)
+ ;; CHECK-NEXT:    (local.set $temp
+ ;; CHECK-NEXT:     (i32.const 10)
+ ;; CHECK-NEXT:    )
+ ;; CHECK-NEXT:    (i32.const 20)
+ ;; CHECK-NEXT:   )
+ ;; CHECK-NEXT:  )
+ ;; CHECK-NEXT:  (drop
+ ;; CHECK-NEXT:   (block (result i32)
+ ;; CHECK-NEXT:    (global.set $g
+ ;; CHECK-NEXT:     (i32.const 30)
+ ;; CHECK-NEXT:    )
+ ;; CHECK-NEXT:    (i32.const 40)
+ ;; CHECK-NEXT:   )
+ ;; CHECK-NEXT:  )
+ ;; CHECK-NEXT:  (i32.const 60)
+ ;; CHECK-NEXT: )
+ (func $nested-global (result i32)
+  (local $temp i32)
+  ;; Nested effects inside arms, and one is a global effect.
+  (i32.add
+   (block (result i32)
+    (local.set $temp
+     (i32.const 10)
+    )
+    (i32.const 20)
+   )
+   (block (result i32)
+    (global.set $g
+     (i32.const 30)
+    )
+    (i32.const 40)
+   )
+  )
+ )
+
+ ;; CHECK:      (func $if (type $0) (result i32)
+ ;; CHECK-NEXT:  (i32.const 2)
+ ;; CHECK-NEXT: )
+ (func $if (result i32)
+  ;; We precompute simple ifs.
+  (if (result i32)
+   (i32.const 1)
+   (then
+    (i32.const 2)
+   )
+   (else
+    (i32.const 3)
+   )
+  )
+ )
+
+ ;; CHECK:      (func $if-no (type $0) (result i32)
+ ;; CHECK-NEXT:  (if (result i32)
+ ;; CHECK-NEXT:   (i32.const 1)
+ ;; CHECK-NEXT:   (then
+ ;; CHECK-NEXT:    (global.set $g
+ ;; CHECK-NEXT:     (i32.const 20)
+ ;; CHECK-NEXT:    )
+ ;; CHECK-NEXT:    (i32.const 2)
+ ;; CHECK-NEXT:   )
+ ;; CHECK-NEXT:   (else
+ ;; CHECK-NEXT:    (i32.const 3)
+ ;; CHECK-NEXT:   )
+ ;; CHECK-NEXT:  )
+ ;; CHECK-NEXT: )
+ (func $if-no (result i32)
+  ;; We do not precompute ifs with effects.
+  (if (result i32)
+   (i32.const 1)
+   (then
+    (block (result i32)
+     (global.set $g
+      (i32.const 20)
+     )
+     (i32.const 2)
+    )
+   )
+   (else
+    (i32.const 3)
+   )
+  )
+ )
+
+ ;; CHECK:      (func $try (type $0) (result i32)
+ ;; CHECK-NEXT:  (try (result i32)
+ ;; CHECK-NEXT:   (do
+ ;; CHECK-NEXT:    (i32.const 1)
+ ;; CHECK-NEXT:   )
+ ;; CHECK-NEXT:   (catch_all
+ ;; CHECK-NEXT:    (i32.const 2)
+ ;; CHECK-NEXT:   )
+ ;; CHECK-NEXT:  )
+ ;; CHECK-NEXT: )
+ (func $try (result i32)
+  ;; We don't precompute trys.
+  (try (result i32)
+   (do
+    (i32.const 1)
+   )
+   (catch_all
+    (i32.const 2)
+   )
+  )
+ )
+
+ ;; CHECK:      (func $ordering (type $0) (result i32)
+ ;; CHECK-NEXT:  (local $temp i32)
+ ;; CHECK-NEXT:  (block $out (result i32)
+ ;; CHECK-NEXT:   (select
+ ;; CHECK-NEXT:    (block (result i32)
+ ;; CHECK-NEXT:     (local.set $temp
+ ;; CHECK-NEXT:      (i32.const 0)
+ ;; CHECK-NEXT:     )
+ ;; CHECK-NEXT:     (i32.const 20)
+ ;; CHECK-NEXT:    )
+ ;; CHECK-NEXT:    (br $out
+ ;; CHECK-NEXT:     (i32.const 10)
+ ;; CHECK-NEXT:    )
+ ;; CHECK-NEXT:    (block (result i32)
+ ;; CHECK-NEXT:     (global.set $g
+ ;; CHECK-NEXT:      (i32.const 30)
+ ;; CHECK-NEXT:     )
+ ;; CHECK-NEXT:     (i32.const 40)
+ ;; CHECK-NEXT:    )
+ ;; CHECK-NEXT:   )
+ ;; CHECK-NEXT:  )
+ ;; CHECK-NEXT: )
+ (func $ordering (result i32)
+  (local $temp i32)
+  ;; Nested effects inside arms. The br in the middle arm will execute, so we
+  ;; want to precompute the entire select into a br, but we must keep alive the
+  ;; children before and after. While doing so, we must not *reorder* the middle
+  ;; child against them: if we just remove the middle child (and add a br at the
+  ;; end) then we are changing the order of execution, as the global.set would
+  ;; happen, when before it did not. For simplicity, we do not optimize here.
+  (block $out (result i32)
+   (select
+    (block (result i32)
+     (local.set $temp
+      (i32.const 0)
+     )
+     (i32.const 20)
+    )
+    (block (result i32)
+     (br $out
+      (i32.const 10)
+     )
+     (i32.const 20)
+    )
+    (block (result i32)
+     (global.set $g
+      (i32.const 30)
+     )
+     (i32.const 40)
+    )
+   )
+  )
+ )
+)
+
diff --git a/test/lit/passes/precompute-gc.wast b/test/lit/passes/precompute-gc.wast
index f4de1f71508..e07dfd398e6 100644
--- a/test/lit/passes/precompute-gc.wast
+++ b/test/lit/passes/precompute-gc.wast
@@ -13,13 +13,13 @@
  ;; two incompatible struct types
  (type $A (struct (field (mut f32))))
 
+ ;; CHECK:      (type $referrer (struct (field (mut (ref null $empty)))))
+ (type $referrer (struct (field (mut (ref null $empty)))))
+
  ;; CHECK:      (type $func-return-i32 (func (result i32)))
 
  ;; CHECK:      (type $array-i32 (array (mut i32)))
 
- ;; CHECK:      (type $referrer (struct (field (mut (ref null $empty)))))
- (type $referrer (struct (field (mut (ref null $empty)))))
-
  ;; CHECK:      (type $B (struct (field (mut f64))))
  (type $B (struct (field (mut f64))))
 
@@ -32,7 +32,7 @@
  ;; CHECK:      (type $array-ref (array (mut (ref null $array-i32))))
  (type $array-ref (array (mut (ref null $array-i32))))
 
- ;; CHECK:      (import "fuzzing-support" "log-i32" (func $log (type $5) (param i32)))
+ ;; CHECK:      (import "fuzzing-support" "log-i32" (func $log (type $6) (param i32)))
  (import "fuzzing-support" "log-i32" (func $log (param i32)))
 
  ;; CHECK:      (func $test-fallthrough (type $func-return-i32) (result i32)
@@ -129,7 +129,7 @@
    (struct.get $struct 0 (local.get $x))
   )
  )
- ;; CHECK:      (func $load-from-struct-bad-merge (type $5) (param $i i32)
+ ;; CHECK:      (func $load-from-struct-bad-merge (type $6) (param $i i32)
  ;; CHECK-NEXT:  (local $x (ref null $struct))
  ;; CHECK-NEXT:  (if
  ;; CHECK-NEXT:   (local.get $i)
@@ -347,11 +347,13 @@
  ;; CHECK-NEXT:  (local $tempresult i32)
  ;; CHECK-NEXT:  (local $tempref (ref null $empty))
  ;; CHECK-NEXT:  (local.set $tempresult
- ;; CHECK-NEXT:   (ref.eq
- ;; CHECK-NEXT:    (local.tee $tempref
- ;; CHECK-NEXT:     (struct.new_default $empty)
+ ;; CHECK-NEXT:   (block (result i32)
+ ;; CHECK-NEXT:    (drop
+ ;; CHECK-NEXT:     (local.tee $tempref
+ ;; CHECK-NEXT:      (struct.new_default $empty)
+ ;; CHECK-NEXT:     )
  ;; CHECK-NEXT:    )
- ;; CHECK-NEXT:    (local.get $tempref)
+ ;; CHECK-NEXT:    (i32.const 1)
  ;; CHECK-NEXT:   )
  ;; CHECK-NEXT:  )
  ;; CHECK-NEXT:  (i32.const 1)
@@ -359,7 +361,8 @@
  (func $propagate-equal (result i32)
   (local $tempresult i32)
   (local $tempref (ref null $empty))
-  ;; assign the result, so that propagate calculates the ref.eq
+  ;; We can compute a 1 here, as the ref.eq compares a struct to itself. We must
+  ;; keep the tee around, however.
   (local.set $tempresult
    (ref.eq
     ;; allocate one struct
@@ -369,8 +372,7 @@
     (local.get $tempref)
    )
   )
-  ;; we can compute a 1 here as the ref.eq compares a struct to itself. note
-  ;; that the ref.eq itself cannot be precomputed away (as it has side effects).
+  ;; We can propagate 1 to here.
   (local.get $tempresult)
  )
  ;; CHECK:      (func $propagate-unequal (type $func-return-i32) (result i32)
@@ -902,7 +904,7 @@
   )
  )
 
- ;; CHECK:      (func $ref.is_null (type $5) (param $param i32)
+ ;; CHECK:      (func $ref.is_null (type $6) (param $param i32)
  ;; CHECK-NEXT:  (local $ref (ref null $empty))
  ;; CHECK-NEXT:  (local.set $ref
  ;; CHECK-NEXT:   (struct.new_default $empty)
@@ -1212,15 +1214,19 @@
  ;; CHECK-NEXT:  (local $2 i32)
  ;; CHECK-NEXT:  (local $3 (ref $array-i32))
  ;; CHECK-NEXT:  (drop
- ;; CHECK-NEXT:   (i32.lt_u
- ;; CHECK-NEXT:    (local.tee $2
- ;; CHECK-NEXT:     (select
- ;; CHECK-NEXT:      (i32.const 0)
+ ;; CHECK-NEXT:   (block (result i32)
+ ;; CHECK-NEXT:    (drop
+ ;; CHECK-NEXT:     (local.tee $2
  ;; CHECK-NEXT:      (block (result i32)
  ;; CHECK-NEXT:       (drop
- ;; CHECK-NEXT:        (array.new $array-ref
- ;; CHECK-NEXT:         (local.tee $1
- ;; CHECK-NEXT:          (array.new_default $array-i32
+ ;; CHECK-NEXT:        (block (result i32)
+ ;; CHECK-NEXT:         (drop
+ ;; CHECK-NEXT:          (array.new $array-ref
+ ;; CHECK-NEXT:           (local.tee $1
+ ;; CHECK-NEXT:            (array.new_default $array-i32
+ ;; CHECK-NEXT:             (i32.const 0)
+ ;; CHECK-NEXT:            )
+ ;; CHECK-NEXT:           )
  ;; CHECK-NEXT:           (i32.const 0)
  ;; CHECK-NEXT:          )
  ;; CHECK-NEXT:         )
@@ -1229,14 +1235,19 @@
  ;; CHECK-NEXT:       )
  ;; CHECK-NEXT:       (i32.const 0)
  ;; CHECK-NEXT:      )
- ;; CHECK-NEXT:      (i32.const 0)
  ;; CHECK-NEXT:     )
  ;; CHECK-NEXT:    )
- ;; CHECK-NEXT:    (array.len
- ;; CHECK-NEXT:     (local.tee $3
- ;; CHECK-NEXT:      (local.get $1)
+ ;; CHECK-NEXT:    (drop
+ ;; CHECK-NEXT:     (block (result i32)
+ ;; CHECK-NEXT:      (drop
+ ;; CHECK-NEXT:       (local.tee $3
+ ;; CHECK-NEXT:        (local.get $1)
+ ;; CHECK-NEXT:       )
+ ;; CHECK-NEXT:      )
+ ;; CHECK-NEXT:      (i32.const 0)
  ;; CHECK-NEXT:     )
  ;; CHECK-NEXT:    )
+ ;; CHECK-NEXT:    (i32.const 0)
  ;; CHECK-NEXT:   )
  ;; CHECK-NEXT:  )
  ;; CHECK-NEXT: )
@@ -1254,7 +1265,8 @@
   ;; if we did then we'd think that entire array.new has no effects, and can be
   ;; optimized away, together with large chunks of the rest of the code.
   ;;
-  ;; We should not succeed in optimizing anything away here.
+  ;; We should not optimize away any local.tee (but we can remove things like
+  ;; the i32.lt_u).
   (drop
    (i32.lt_u
     (local.tee $2
@@ -1336,4 +1348,154 @@
    )
   )
  )
+
+ ;; CHECK:      (func $nested-struct-ref.eq-tee (type $2)
+ ;; CHECK-NEXT:  (local $A (ref $referrer))
+ ;; CHECK-NEXT:  (drop
+ ;; CHECK-NEXT:   (block (result i32)
+ ;; CHECK-NEXT:    (drop
+ ;; CHECK-NEXT:     (local.tee $A
+ ;; CHECK-NEXT:      (struct.new $referrer
+ ;; CHECK-NEXT:       (struct.new_default $empty)
+ ;; CHECK-NEXT:      )
+ ;; CHECK-NEXT:     )
+ ;; CHECK-NEXT:    )
+ ;; CHECK-NEXT:    (i32.const 1)
+ ;; CHECK-NEXT:   )
+ ;; CHECK-NEXT:  )
+ ;; CHECK-NEXT: )
+ (func $nested-struct-ref.eq-tee
+  (local $A (ref $referrer))
+  ;; As above, but immediately ref.eq the tee'd value with a get of itself. This
+  ;; can be computed to 1, but we must keep the tee effect.
+  (drop
+   (ref.eq
+    (local.tee $A
+     (struct.new $referrer
+      (struct.new_default $empty)
+     )
+    )
+    (local.get $A)
+   )
+  )
+ )
+
+ ;; CHECK:      (func $nested-struct-ref.eq-tee-2 (type $2)
+ ;; CHECK-NEXT:  (local $A (ref $referrer))
+ ;; CHECK-NEXT:  (local $B (ref $empty))
+ ;; CHECK-NEXT:  (drop
+ ;; CHECK-NEXT:   (block (result i32)
+ ;; CHECK-NEXT:    (drop
+ ;; CHECK-NEXT:     (local.tee $A
+ ;; CHECK-NEXT:      (struct.new $referrer
+ ;; CHECK-NEXT:       (local.tee $B
+ ;; CHECK-NEXT:        (struct.new_default $empty)
+ ;; CHECK-NEXT:       )
+ ;; CHECK-NEXT:      )
+ ;; CHECK-NEXT:     )
+ ;; CHECK-NEXT:    )
+ ;; CHECK-NEXT:    (i32.const 1)
+ ;; CHECK-NEXT:   )
+ ;; CHECK-NEXT:  )
+ ;; CHECK-NEXT: )
+ (func $nested-struct-ref.eq-tee-2
+  (local $A (ref $referrer))
+  (local $B (ref $empty))
+  ;; As above but with an extra nested tee, causing more cache usage. We can
+  ;; optimize in the same way.
+  (drop
+   (ref.eq
+    (local.tee $A
+     (struct.new $referrer
+      (local.tee $B
+       (struct.new_default $empty)
+      )
+     )
+    )
+    (local.get $A)
+   )
+  )
+ )
+
+ ;; CHECK:      (func $nested-struct-ref.eq-tee-3 (type $2)
+ ;; CHECK-NEXT:  (local $A (ref $referrer))
+ ;; CHECK-NEXT:  (local $A2 (ref $referrer))
+ ;; CHECK-NEXT:  (local $B (ref $empty))
+ ;; CHECK-NEXT:  (local $B2 (ref $empty))
+ ;; CHECK-NEXT:  (drop
+ ;; CHECK-NEXT:   (block (result i32)
+ ;; CHECK-NEXT:    (drop
+ ;; CHECK-NEXT:     (local.tee $A
+ ;; CHECK-NEXT:      (struct.new $referrer
+ ;; CHECK-NEXT:       (local.tee $B
+ ;; CHECK-NEXT:        (local.tee $B
+ ;; CHECK-NEXT:         (struct.new_default $empty)
+ ;; CHECK-NEXT:        )
+ ;; CHECK-NEXT:       )
+ ;; CHECK-NEXT:      )
+ ;; CHECK-NEXT:     )
+ ;; CHECK-NEXT:    )
+ ;; CHECK-NEXT:    (drop
+ ;; CHECK-NEXT:     (local.tee $A2
+ ;; CHECK-NEXT:      (local.get $A)
+ ;; CHECK-NEXT:     )
+ ;; CHECK-NEXT:    )
+ ;; CHECK-NEXT:    (i32.const 1)
+ ;; CHECK-NEXT:   )
+ ;; CHECK-NEXT:  )
+ ;; CHECK-NEXT:  (call $log
+ ;; CHECK-NEXT:   (i32.const 1)
+ ;; CHECK-NEXT:  )
+ ;; CHECK-NEXT:  (call $log
+ ;; CHECK-NEXT:   (i32.const 0)
+ ;; CHECK-NEXT:  )
+ ;; CHECK-NEXT:  (call $log
+ ;; CHECK-NEXT:   (i32.const 1)
+ ;; CHECK-NEXT:  )
+ ;; CHECK-NEXT: )
+ (func $nested-struct-ref.eq-tee-3
+  (local $A (ref $referrer))
+  (local $A2 (ref $referrer))
+  (local $B (ref $empty))
+  (local $B2 (ref $empty))
+  ;; As above but with yet more nested tees, causing more cache usage.
+  ;; We can optimize in the same way.
+  (drop
+   (ref.eq
+    (local.tee $A
+     (struct.new $referrer
+      (local.tee $B
+       (local.tee $B
+        (struct.new_default $empty)
+       )
+      )
+     )
+    )
+    (local.tee $A2
+     (local.get $A)
+    )
+   )
+  )
+  ;; Use the extra tee. We can optimize to 1 here.
+  (call $log
+   (ref.eq
+    (local.get $A)
+    (local.get $A2)
+   )
+  )
+  ;; This evaluates to 0.
+  (call $log
+   (ref.eq
+    (local.get $A)
+    (local.get $B)
+   )
+  )
+  ;; And this to 1.
+  (call $log
+   (ref.eq
+    (local.get $B)
+    (local.get $B)
+   )
+  )
+ )
 )
diff --git a/test/passes/precompute-propagate_all-features.txt b/test/passes/precompute-propagate_all-features.txt
index 6f5a716f93b..4068b752dbd 100644
--- a/test/passes/precompute-propagate_all-features.txt
+++ b/test/passes/precompute-propagate_all-features.txt
@@ -251,10 +251,13 @@
  )
  (func $through-tee-more (type $2) (param $x i32) (param $y i32) (result i32)
   (local.set $x
-   (i32.eqz
-    (local.tee $y
-     (i32.const 7)
+   (block (result i32)
+    (drop
+     (local.tee $y
+      (i32.const 7)
+     )
     )
+    (i32.const 0)
    )
   )
   (return
diff --git a/test/passes/precompute_all-features.txt b/test/passes/precompute_all-features.txt
index ee79a5c26f1..ab836bb4b4c 100644
--- a/test/passes/precompute_all-features.txt
+++ b/test/passes/precompute_all-features.txt
@@ -106,7 +106,9 @@
    (call $ret)
    (then
     (return
-     (i32.const 1)
+     (return
+      (i32.const 1)
+     )
     )
    )
   )