diff --git a/src/passes/Precompute.cpp b/src/passes/Precompute.cpp index c1a9a1bb43a..f3a6e52e8c2 100644 --- a/src/passes/Precompute.cpp +++ b/src/passes/Precompute.cpp @@ -72,20 +72,19 @@ using GetValues = std::unordered_map; // possible input values than that struct.new, which means we will not infer // a value for it, and not attempt to say anything about comparisons of $x. struct HeapValues { - // Store two maps, one for effects and one without. The one with effects is - // used when PRESERVE_SIDEEFFECTS is on, and the other when not. This is - // necessary because when we preserve effects then nested effects in a GC - // allocation can cause us to end up as nonconstant (nothing can be - // precomputed), and we do not want to mix results between the two modes (if - // we did, we might cache a result when we ignore effects that we later use - // when not ignoring them, which would forget the effects). - std::unordered_map> withEffects, - withoutEffects; - - void clear() { - withEffects.clear(); - withoutEffects.clear(); - } + struct Entry { + // The GC data for an expression. + std::shared_ptr data; + // Whether the expression has effects. If it does then we must recompute it + // each time we see it, even though we return |data| to represent it. + // (Recomputing will apply those effects each time, so we don't forget them + // when we read from the cache. This recomputing is rare, and doesn't happen + // e.g. in global GC objects, where most of the work happens, so this cache + // still saves a lot.) + bool hasEffects; + }; + + std::unordered_map map; }; // Precomputes an expression. Errors if we hit anything that can't be @@ -202,23 +201,28 @@ class PrecomputingExpressionRunner // Generates heap info for a heap-allocating expression. Flow getGCAllocation(Expression* curr, std::function visitFunc) { - auto& heapValuesMap = (flags & FlagValues::PRESERVE_SIDEEFFECTS) - ? 
heapValues.withEffects - : heapValues.withoutEffects; // We must return a literal that refers to the canonical location for this // source expression, so that each time we compute a specific *.new then // we get the same identity. - auto iter = heapValuesMap.find(curr); - if (iter != heapValuesMap.end()) { + auto iter = heapValues.map.find(curr); + if (iter != heapValues.map.end()) { + auto& [data, hasEffects] = iter->second; + if (hasEffects) { + // Visit, so we recompute the effects. (This is rare, see comment + // above.) + visitFunc(); + } // Refer to the same canonical GCData that we already created. - return Literal(iter->second, curr->type.getHeapType()); + return Literal(data, curr->type.getHeapType()); } - // Only call the visitor function here, so we do it once per allocation. + // Only call the visitor function here, so we do it once per allocation. See + // if we have effects while doing so. auto flow = visitFunc(); if (flow.breaking()) { return flow; } - heapValuesMap[curr] = flow.getSingleValue().getGCData(); + heapValues.map[curr] = + HeapValues::Entry{flow.getSingleValue().getGCData(), hasEffectfulSets()}; return flow; } @@ -301,94 +305,169 @@ struct Precompute // unlikely chance, we leave such things for later. 
} - template void reuseConstantNode(T* curr, Flow flow) { - if (flow.values.isConcrete()) { - // reuse a const / ref.null / ref.func node if there is one - if (curr->value && flow.values.size() == 1) { - Literal singleValue = flow.getSingleValue(); - if (singleValue.type.isNumber()) { - if (auto* c = curr->value->template dynCast()) { - c->value = singleValue; - c->finalize(); - curr->finalize(); - return; - } - } else if (singleValue.isNull()) { - if (auto* n = curr->value->template dynCast()) { - n->finalize(singleValue.type); - curr->finalize(); - return; - } - } else if (singleValue.type.isRef() && - singleValue.type.getHeapType().isSignature()) { - if (auto* r = curr->value->template dynCast()) { - r->func = singleValue.getFunc(); - auto heapType = getModule()->getFunction(r->func)->type; - r->finalize(heapType); - curr->finalize(); - return; - } - } + void visitExpression(Expression* curr) { + // Ignore trivial things like constants, nops, local/global.set (which have + // an effect we cannot remove, and it is simpler to ignore them here than + // later below), return (which we cannot improve), and loop (which it is + // simpler to leave for other passes). + if (Properties::isConstantExpression(curr) || curr->is() || + curr->is() || curr->is() || curr->is() || + curr->is()) { + return; + } + // Breaks with conditions can be simplified, but unconditional ones are like + // returns, and we cannot improve. + if (auto* br = curr->dynCast()) { + if (!br->condition) { + return; } - curr->value = flow.getConstExpression(*getModule()); - } else { - curr->value = nullptr; } - curr->finalize(); - } - void visitExpression(Expression* curr) { - // TODO: if local.get, only replace with a constant if we don't care about - // size...? - if (Properties::isConstantExpression(curr) || curr->is()) { + // See if we can precompute the value that flows out. 
We set + // |replaceExpression| to false because we do not necessarily want to + // replace it entirely, see below - we may keep parts, in some cases, if we + // can still simplify it to a precomputed value. + Flow flow; + PrecomputingExpressionRunner runner( + getModule(), getValues, heapValues, false /* replaceExpression */); + try { + flow = runner.visit(curr); + } catch (NonconstantException&) { return; } - // try to evaluate this into a const - Flow flow = precomputeExpression(curr); + // The resulting value must be of a type we can emit a constant for (or + // there must be no value at all, in which case the value is a nop). if (!canEmitConstantFor(flow.values)) { return; } + if (flow.breakTo == NONCONSTANT_FLOW) { + // This cannot be turned into a constant, but perhaps we can partially + // precompute it. + considerPartiallyPrecomputing(curr); + return; + } + // TODO: Handle suspends somehow? + if (flow.suspendTag) { + return; + } + + // This looks like a promising precomputation: We have found that its value, + // if any, can be emitted as a constant (or there is no value, and it is a + // nop or break etc.). Build that value, so we can replace the expression + // with it. + Builder builder(*getModule()); + Expression* value = nullptr; + if (flow.values.isConcrete()) { + value = flow.getConstExpression(*getModule()); + } if (flow.breaking()) { - if (flow.breakTo == NONCONSTANT_FLOW) { - // This cannot be turned into a constant, but perhaps we can partially - // precompute it. - considerPartiallyPrecomputing(curr); + if (flow.breakTo == RETURN_FLOW) { + // We avoided trivial returns earlier (by doing so, we avoid wasted + // work replacing a return with itself). + assert(!curr->is()); + value = builder.makeReturn(value); + } else { + value = builder.makeBreak(flow.breakTo, value); + } + // Note we don't need to handle RETURN_CALL_FLOW, as the call there has + // effects that would stop us earlier. + } + + // We have something to replace the expression. 
While precomputing the + // expression, we verified it has no effects that cause problems - no traps + // or exceptions etc., as those things would lead to NONCONSTANT_FLOW. We + // can therefore replace this with what flows out of it. The only exception + // is that we set replaceExpression to false, above, which means we run the + // interpreter without PRESERVE_SIDEEFFECTS. That allows local and global + // sets to happen (to help optimize small code fragments with sets and + // gets). To handle that, keep relevant children if we have such sets. + if (runner.hasEffectfulSets()) { + if (curr->is() || curr->is() || curr->is()) { + // These control flow structures have children that might not execute. + // We know that some of the children have effectful sets, but not which, + // and we can't just keep them all, so give up. + // TODO: Check if this would be useful to improve, but other passes + // might do enough already. return; } - if (flow.breakTo == RETURN_FLOW) { - // this expression causes a return. if it's already a return, reuse the - // node - if (auto* ret = curr->dynCast()) { - reuseConstantNode(ret, flow); - } else { - Builder builder(*getModule()); - replaceCurrent(builder.makeReturn( - flow.values.isConcrete() ? flow.getConstExpression(*getModule()) - : nullptr)); - } + + // To keep things simple, stop here if we are precomputing to a break/ + // return. 
Handling that case requires ordering considerations: + // + // (foo + // (br) + // (call) + // ) + // + // Here we know we need to keep the call, and can remove foo, but this + // would be wrong: + // + // (block + // ;; removed br + // (call) + // (br) ;; the value we precompute to, added at the end + // ) + // + // Instead we must keep the br, leaving this for later opts to improve: + // + // (block + // (br) + // (call) + // (br) + // ) + // + // That is, we cannot remove unneeded children easily in this case, where + // control flow might transfer, so we need to keep all children when we + // remove foo. In that case, it's not clear we are helping much, and other + // passes can do better with the break/return anyhow. After dismissing + // this situation, we know no transfer of control flow needs to be handled + // in the code below (because we executed the code, and found it did not + // do so). + if (flow.breaking()) { return; } - // this expression causes a break, emit it directly. if it's already a br, - // reuse the node. - if (auto* br = curr->dynCast()) { - br->name = flow.breakTo; - br->condition = nullptr; - reuseConstantNode(br, flow); - } else { - Builder builder(*getModule()); - replaceCurrent(builder.makeBreak( - flow.breakTo, - flow.values.isConcrete() ? flow.getConstExpression(*getModule()) - : nullptr)); + + // Find the necessary children that we must keep. + SmallVector kept; + for (auto* child : ChildIterator(curr)) { + EffectAnalyzer effects(getPassOptions(), *getModule(), child); + if (!effects.localsWritten.empty() || !effects.globalsWritten.empty()) { + kept.push_back(builder.makeDrop(child)); + } + } + // Find all the things we must keep, which might include |value|. + if (!kept.empty()) { + if (value) { + kept.push_back(value); + } + if (kept.size() == 1) { + value = kept[0]; + } else { + // We are returning a block with some kept children + some value. 
This + // may seem to increase code size in some cases, but it cannot do so + // monotonically: while doing all this we are definitely removing + // |curr| itself, so we are making progress, even if we emit a new + // constant that we weren't before. That is, we are not in this + // situation: + // + // (foo A B) => (block (foo A B) (value)) + // + // We are in this one: + // + // (foo A B) => (block A B (value)) + // + // where foo vanishes. + value = builder.makeBlock(kept); + } } - return; } - // this was precomputed - if (flow.values.isConcrete()) { - replaceCurrent(flow.getConstExpression(*getModule())); - } else { + if (!value) { + // We don't need to replace this with anything: there is no value or other + // code that we need. Just nop it. ExpressionManipulator::nop(curr); + return; } + replaceCurrent(value); } void visitBlock(Block* curr) { @@ -695,7 +774,7 @@ struct Precompute // |parent|. Results here must not be cached for later. HeapValues temp; auto ifTrue = precomputeExpression(parent, true, &temp); - temp.clear(); + temp.map.clear(); if (isValidPrecomputation(ifTrue)) { *pointerToSelect = select->ifFalse; auto ifFalse = precomputeExpression(parent, true, &temp); diff --git a/src/wasm-interpreter.h b/src/wasm-interpreter.h index c0191e50cce..d6a9db9bf9c 100644 --- a/src/wasm-interpreter.h +++ b/src/wasm-interpreter.h @@ -2741,6 +2741,11 @@ class ConstantExpressionRunner : public ExpressionRunner { globalValues[name] = values; } + // Returns true if we set a local or a global. + bool hasEffectfulSets() const { + return !localValues.empty() || !globalValues.empty(); + } + Flow visitLocalGet(LocalGet* curr) { // Check if a constant value has been set in the context of this runner. 
auto iter = localValues.find(curr->index); diff --git a/test/lit/ctor-eval/return_call.wast b/test/lit/ctor-eval/return_call.wast index 650c5c63ce4..20dace4dbc9 100644 --- a/test/lit/ctor-eval/return_call.wast +++ b/test/lit/ctor-eval/return_call.wast @@ -437,44 +437,25 @@ ;; CHECK-NEXT: ) (module ;; Return call to self with different params, then stop evaluating. - ;; CHECK: (type $0 (func (param i32))) + ;; CHECK: (type $0 (func)) - ;; CHECK: (type $1 (func)) + ;; CHECK: (type $1 (func (param i32))) - ;; CHECK: (import "env" "import" (func $import (type $1))) + ;; CHECK: (import "env" "import" (func $import (type $0))) (import "env" "import" (func $import)) - ;; CHECK: (global $g (mut i32) (i32.const 42)) (global $g (mut i32) (i32.const 0)) - ;; CHECK: (export "test" (func $test_2)) - - ;; CHECK: (func $test (type $0) (param $0 i32) - ;; CHECK-NEXT: (global.set $g - ;; CHECK-NEXT: (local.get $0) - ;; CHECK-NEXT: ) - ;; CHECK-NEXT: (if - ;; CHECK-NEXT: (i32.eq - ;; CHECK-NEXT: (local.get $0) - ;; CHECK-NEXT: (i32.const 42) - ;; CHECK-NEXT: ) - ;; CHECK-NEXT: (then - ;; CHECK-NEXT: (call $import) - ;; CHECK-NEXT: ) - ;; CHECK-NEXT: (else - ;; CHECK-NEXT: (return_call $test - ;; CHECK-NEXT: (i32.add - ;; CHECK-NEXT: (local.get $0) - ;; CHECK-NEXT: (i32.const 1) - ;; CHECK-NEXT: ) - ;; CHECK-NEXT: ) - ;; CHECK-NEXT: ) - ;; CHECK-NEXT: ) - ;; CHECK-NEXT: ) (func $test (export "test") (param i32) (global.set $g (local.get 0) ) + ;; When first called, the param is 0 and we do the return call, sending it + ;; a parameter of 1. That does another return call, and so forth, until the + ;; incrementing counter in the param reaches 42 and the if condition is + ;; true. We then call the import. The final precomputed code after all this + ;; is just to set the local to 42 (which other optimizations can remove), + ;; and to call the import. 
(if (i32.eq (local.get 0) @@ -495,24 +476,11 @@ ) ) -;; CHECK: (func $test_2 (type $0) (param $0 i32) -;; CHECK-NEXT: (if -;; CHECK-NEXT: (i32.eq -;; CHECK-NEXT: (local.tee $0 -;; CHECK-NEXT: (i32.const 42) -;; CHECK-NEXT: ) -;; CHECK-NEXT: (i32.const 42) -;; CHECK-NEXT: ) -;; CHECK-NEXT: (then -;; CHECK-NEXT: (call $import) -;; CHECK-NEXT: ) -;; CHECK-NEXT: (else -;; CHECK-NEXT: (return_call $test -;; CHECK-NEXT: (i32.add -;; CHECK-NEXT: (local.get $0) -;; CHECK-NEXT: (i32.const 1) -;; CHECK-NEXT: ) -;; CHECK-NEXT: ) -;; CHECK-NEXT: ) +;; CHECK: (export "test" (func $test_2)) + +;; CHECK: (func $test_2 (type $1) (param $0 i32) +;; CHECK-NEXT: (local.set $0 +;; CHECK-NEXT: (i32.const 42) ;; CHECK-NEXT: ) +;; CHECK-NEXT: (call $import) ;; CHECK-NEXT: ) diff --git a/test/lit/passes/precompute-effects.wast b/test/lit/passes/precompute-effects.wast new file mode 100644 index 00000000000..d2709cf3a4d --- /dev/null +++ b/test/lit/passes/precompute-effects.wast @@ -0,0 +1,294 @@ +;; NOTE: Assertions have been generated by update_lit_checks.py and should not be edited. + +;; RUN: wasm-opt %s --remove-unused-names --precompute-propagate --fuzz-exec -all -S -o - \ +;; RUN: | filecheck %s + +(module + ;; CHECK: (global $g (mut i32) (i32.const 10)) + (global $g (mut i32) (i32.const 10)) + + ;; CHECK: (func $loop (type $1) + ;; CHECK-NEXT: (local $temp i32) + ;; CHECK-NEXT: (local.set $temp + ;; CHECK-NEXT: (i32.const 10) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + (func $loop + (local $temp i32) + ;; We should not try to precompute this loop. If we attempted to replace it + ;; with its children, we'd need to handle the effects of children properly, + ;; which we do not do in this pass. 
+ (loop + (local.set $temp + (i32.const 10) + ) + ) + ) + + ;; CHECK: (func $local.set (type $1) + ;; CHECK-NEXT: (local $temp i32) + ;; CHECK-NEXT: (local.set $temp + ;; CHECK-NEXT: (i32.const 10) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (drop + ;; CHECK-NEXT: (local.tee $temp + ;; CHECK-NEXT: (i32.const 20) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + (func $local.set + (local $temp i32) + ;; We should not try to precompute a set or tee. + (local.set $temp + (i32.const 10) + ) + (drop + (local.tee $temp + (i32.const 20) + ) + ) + ) + + ;; CHECK: (func $global.set (type $1) + ;; CHECK-NEXT: (global.set $g + ;; CHECK-NEXT: (i32.const 20) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + (func $global.set + ;; We should not try to precompute a global set. + (global.set $g + (i32.const 20) + ) + ) + + ;; CHECK: (func $binary-tee (type $0) (result i32) + ;; CHECK-NEXT: (local $temp i32) + ;; CHECK-NEXT: (drop + ;; CHECK-NEXT: (local.tee $temp + ;; CHECK-NEXT: (i32.const 10) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.const 20) + ;; CHECK-NEXT: ) + (func $binary-tee (result i32) + (local $temp i32) + ;; We can precompute this and remove the add, but must keep the tee. + (i32.add + (local.tee $temp + (i32.const 10) + ) + (local.get $temp) + ) + ) + + ;; CHECK: (func $binary-tee-2 (type $0) (result i32) + ;; CHECK-NEXT: (local $temp i32) + ;; CHECK-NEXT: (drop + ;; CHECK-NEXT: (local.tee $temp + ;; CHECK-NEXT: (i32.const 10) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.const 10) + ;; CHECK-NEXT: ) + (func $binary-tee-2 (result i32) + (local $temp i32) + ;; A tee on the other side. 
+ (i32.add + (local.get $temp) + (local.tee $temp + (i32.const 10) + ) + ) + ) + + ;; CHECK: (func $binary-both (type $0) (result i32) + ;; CHECK-NEXT: (local $temp i32) + ;; CHECK-NEXT: (drop + ;; CHECK-NEXT: (local.tee $temp + ;; CHECK-NEXT: (i32.const 10) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (drop + ;; CHECK-NEXT: (local.tee $temp + ;; CHECK-NEXT: (i32.const 20) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.const 30) + ;; CHECK-NEXT: ) + (func $binary-both (result i32) + (local $temp i32) + ;; Now we must keep both tees. + (i32.add + (local.tee $temp + (i32.const 10) + ) + (local.tee $temp + (i32.const 20) + ) + ) + ) + + ;; CHECK: (func $nested-global (type $0) (result i32) + ;; CHECK-NEXT: (local $temp i32) + ;; CHECK-NEXT: (drop + ;; CHECK-NEXT: (block (result i32) + ;; CHECK-NEXT: (local.set $temp + ;; CHECK-NEXT: (i32.const 10) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.const 20) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (drop + ;; CHECK-NEXT: (block (result i32) + ;; CHECK-NEXT: (global.set $g + ;; CHECK-NEXT: (i32.const 30) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.const 40) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.const 60) + ;; CHECK-NEXT: ) + (func $nested-global (result i32) + (local $temp i32) + ;; Nested effects inside arms, and one is a global effect. + (i32.add + (block (result i32) + (local.set $temp + (i32.const 10) + ) + (i32.const 20) + ) + (block (result i32) + (global.set $g + (i32.const 30) + ) + (i32.const 40) + ) + ) + ) + + ;; CHECK: (func $if (type $0) (result i32) + ;; CHECK-NEXT: (i32.const 2) + ;; CHECK-NEXT: ) + (func $if (result i32) + ;; We precompute simple ifs. 
+ (if (result i32) + (i32.const 1) + (then + (i32.const 2) + ) + (else + (i32.const 3) + ) + ) + ) + + ;; CHECK: (func $if-no (type $0) (result i32) + ;; CHECK-NEXT: (if (result i32) + ;; CHECK-NEXT: (i32.const 1) + ;; CHECK-NEXT: (then + ;; CHECK-NEXT: (global.set $g + ;; CHECK-NEXT: (i32.const 20) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.const 2) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (else + ;; CHECK-NEXT: (i32.const 3) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + (func $if-no (result i32) + ;; We do not precompute ifs with effects. + (if (result i32) + (i32.const 1) + (then + (block (result i32) + (global.set $g + (i32.const 20) + ) + (i32.const 2) + ) + ) + (else + (i32.const 3) + ) + ) + ) + + ;; CHECK: (func $try (type $0) (result i32) + ;; CHECK-NEXT: (try (result i32) + ;; CHECK-NEXT: (do + ;; CHECK-NEXT: (i32.const 1) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (catch_all + ;; CHECK-NEXT: (i32.const 2) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + (func $try (result i32) + ;; We don't precompute trys. + (try (result i32) + (do + (i32.const 1) + ) + (catch_all + (i32.const 2) + ) + ) + ) + + ;; CHECK: (func $ordering (type $0) (result i32) + ;; CHECK-NEXT: (local $temp i32) + ;; CHECK-NEXT: (block $out (result i32) + ;; CHECK-NEXT: (select + ;; CHECK-NEXT: (block (result i32) + ;; CHECK-NEXT: (local.set $temp + ;; CHECK-NEXT: (i32.const 0) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.const 20) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (br $out + ;; CHECK-NEXT: (i32.const 10) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (block (result i32) + ;; CHECK-NEXT: (global.set $g + ;; CHECK-NEXT: (i32.const 30) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.const 40) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + (func $ordering (result i32) + (local $temp i32) + ;; Nested effects inside arms. 
The br in the middle arm will execute, so we + ;; want to precompute the entire select into a br, but we must keep alive the + ;; children before and after. While doing so, we must not *reorder* the middle + ;; child against them: if we just remove the middle child (and add a br at the + ;; end) then we are changing the order of execution, as the global.set would + ;; happen, when before it did not. For simplicity, we do not optimize here. + (block $out (result i32) + (select + (block (result i32) + (local.set $temp + (i32.const 0) + ) + (i32.const 20) + ) + (block (result i32) + (br $out + (i32.const 10) + ) + (i32.const 20) + ) + (block (result i32) + (global.set $g + (i32.const 30) + ) + (i32.const 40) + ) + ) + ) + ) +) + diff --git a/test/lit/passes/precompute-gc.wast b/test/lit/passes/precompute-gc.wast index f4de1f71508..e07dfd398e6 100644 --- a/test/lit/passes/precompute-gc.wast +++ b/test/lit/passes/precompute-gc.wast @@ -13,13 +13,13 @@ ;; two incompatible struct types (type $A (struct (field (mut f32)))) + ;; CHECK: (type $referrer (struct (field (mut (ref null $empty))))) + (type $referrer (struct (field (mut (ref null $empty))))) + ;; CHECK: (type $func-return-i32 (func (result i32))) ;; CHECK: (type $array-i32 (array (mut i32))) - ;; CHECK: (type $referrer (struct (field (mut (ref null $empty))))) - (type $referrer (struct (field (mut (ref null $empty))))) - ;; CHECK: (type $B (struct (field (mut f64)))) (type $B (struct (field (mut f64)))) @@ -32,7 +32,7 @@ ;; CHECK: (type $array-ref (array (mut (ref null $array-i32)))) (type $array-ref (array (mut (ref null $array-i32)))) - ;; CHECK: (import "fuzzing-support" "log-i32" (func $log (type $5) (param i32))) + ;; CHECK: (import "fuzzing-support" "log-i32" (func $log (type $6) (param i32))) (import "fuzzing-support" "log-i32" (func $log (param i32))) ;; CHECK: (func $test-fallthrough (type $func-return-i32) (result i32) @@ -129,7 +129,7 @@ (struct.get $struct 0 (local.get $x)) ) ) - ;; CHECK: (func 
$load-from-struct-bad-merge (type $5) (param $i i32) + ;; CHECK: (func $load-from-struct-bad-merge (type $6) (param $i i32) ;; CHECK-NEXT: (local $x (ref null $struct)) ;; CHECK-NEXT: (if ;; CHECK-NEXT: (local.get $i) @@ -347,11 +347,13 @@ ;; CHECK-NEXT: (local $tempresult i32) ;; CHECK-NEXT: (local $tempref (ref null $empty)) ;; CHECK-NEXT: (local.set $tempresult - ;; CHECK-NEXT: (ref.eq - ;; CHECK-NEXT: (local.tee $tempref - ;; CHECK-NEXT: (struct.new_default $empty) + ;; CHECK-NEXT: (block (result i32) + ;; CHECK-NEXT: (drop + ;; CHECK-NEXT: (local.tee $tempref + ;; CHECK-NEXT: (struct.new_default $empty) + ;; CHECK-NEXT: ) ;; CHECK-NEXT: ) - ;; CHECK-NEXT: (local.get $tempref) + ;; CHECK-NEXT: (i32.const 1) ;; CHECK-NEXT: ) ;; CHECK-NEXT: ) ;; CHECK-NEXT: (i32.const 1) @@ -359,7 +361,8 @@ (func $propagate-equal (result i32) (local $tempresult i32) (local $tempref (ref null $empty)) - ;; assign the result, so that propagate calculates the ref.eq + ;; We can compute a 1 here, as the ref.eq compares a struct to itself. We must + ;; keep the tee around, however. (local.set $tempresult (ref.eq ;; allocate one struct @@ -369,8 +372,7 @@ (local.get $tempref) ) ) - ;; we can compute a 1 here as the ref.eq compares a struct to itself. note - ;; that the ref.eq itself cannot be precomputed away (as it has side effects). + ;; We can propagate 1 to here. 
(local.get $tempresult) ) ;; CHECK: (func $propagate-unequal (type $func-return-i32) (result i32) @@ -902,7 +904,7 @@ ) ) - ;; CHECK: (func $ref.is_null (type $5) (param $param i32) + ;; CHECK: (func $ref.is_null (type $6) (param $param i32) ;; CHECK-NEXT: (local $ref (ref null $empty)) ;; CHECK-NEXT: (local.set $ref ;; CHECK-NEXT: (struct.new_default $empty) @@ -1212,15 +1214,19 @@ ;; CHECK-NEXT: (local $2 i32) ;; CHECK-NEXT: (local $3 (ref $array-i32)) ;; CHECK-NEXT: (drop - ;; CHECK-NEXT: (i32.lt_u - ;; CHECK-NEXT: (local.tee $2 - ;; CHECK-NEXT: (select - ;; CHECK-NEXT: (i32.const 0) + ;; CHECK-NEXT: (block (result i32) + ;; CHECK-NEXT: (drop + ;; CHECK-NEXT: (local.tee $2 ;; CHECK-NEXT: (block (result i32) ;; CHECK-NEXT: (drop - ;; CHECK-NEXT: (array.new $array-ref - ;; CHECK-NEXT: (local.tee $1 - ;; CHECK-NEXT: (array.new_default $array-i32 + ;; CHECK-NEXT: (block (result i32) + ;; CHECK-NEXT: (drop + ;; CHECK-NEXT: (array.new $array-ref + ;; CHECK-NEXT: (local.tee $1 + ;; CHECK-NEXT: (array.new_default $array-i32 + ;; CHECK-NEXT: (i32.const 0) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) ;; CHECK-NEXT: (i32.const 0) ;; CHECK-NEXT: ) ;; CHECK-NEXT: ) @@ -1229,14 +1235,19 @@ ;; CHECK-NEXT: ) ;; CHECK-NEXT: (i32.const 0) ;; CHECK-NEXT: ) - ;; CHECK-NEXT: (i32.const 0) ;; CHECK-NEXT: ) ;; CHECK-NEXT: ) - ;; CHECK-NEXT: (array.len - ;; CHECK-NEXT: (local.tee $3 - ;; CHECK-NEXT: (local.get $1) + ;; CHECK-NEXT: (drop + ;; CHECK-NEXT: (block (result i32) + ;; CHECK-NEXT: (drop + ;; CHECK-NEXT: (local.tee $3 + ;; CHECK-NEXT: (local.get $1) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.const 0) ;; CHECK-NEXT: ) ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.const 0) ;; CHECK-NEXT: ) ;; CHECK-NEXT: ) ;; CHECK-NEXT: ) @@ -1254,7 +1265,8 @@ ;; if we did then we'd think that entire array.new has no effects, and can be ;; optimized away, together with large chunks of the rest of the code. ;; - ;; We should not succeed in optimizing anything away here. 
+ ;; We should not optimize away any local.tee (but we can remove things like + ;; the i32.lt_u). (drop (i32.lt_u (local.tee $2 @@ -1336,4 +1348,154 @@ ) ) ) + + ;; CHECK: (func $nested-struct-ref.eq-tee (type $2) + ;; CHECK-NEXT: (local $A (ref $referrer)) + ;; CHECK-NEXT: (drop + ;; CHECK-NEXT: (block (result i32) + ;; CHECK-NEXT: (drop + ;; CHECK-NEXT: (local.tee $A + ;; CHECK-NEXT: (struct.new $referrer + ;; CHECK-NEXT: (struct.new_default $empty) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.const 1) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + (func $nested-struct-ref.eq-tee + (local $A (ref $referrer)) + ;; As above, but immediately ref.eq the tee'd value with a get of itself. This + ;; can be computed to 1, but we must keep the tee effect. + (drop + (ref.eq + (local.tee $A + (struct.new $referrer + (struct.new_default $empty) + ) + ) + (local.get $A) + ) + ) + ) + + ;; CHECK: (func $nested-struct-ref.eq-tee-2 (type $2) + ;; CHECK-NEXT: (local $A (ref $referrer)) + ;; CHECK-NEXT: (local $B (ref $empty)) + ;; CHECK-NEXT: (drop + ;; CHECK-NEXT: (block (result i32) + ;; CHECK-NEXT: (drop + ;; CHECK-NEXT: (local.tee $A + ;; CHECK-NEXT: (struct.new $referrer + ;; CHECK-NEXT: (local.tee $B + ;; CHECK-NEXT: (struct.new_default $empty) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.const 1) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + (func $nested-struct-ref.eq-tee-2 + (local $A (ref $referrer)) + (local $B (ref $empty)) + ;; As above but with an extra nested tee, causing more cache usage. We can + ;; optimize in the same way. 
+ (drop + (ref.eq + (local.tee $A + (struct.new $referrer + (local.tee $B + (struct.new_default $empty) + ) + ) + ) + (local.get $A) + ) + ) + ) + + ;; CHECK: (func $nested-struct-ref.eq-tee-3 (type $2) + ;; CHECK-NEXT: (local $A (ref $referrer)) + ;; CHECK-NEXT: (local $A2 (ref $referrer)) + ;; CHECK-NEXT: (local $B (ref $empty)) + ;; CHECK-NEXT: (local $B2 (ref $empty)) + ;; CHECK-NEXT: (drop + ;; CHECK-NEXT: (block (result i32) + ;; CHECK-NEXT: (drop + ;; CHECK-NEXT: (local.tee $A + ;; CHECK-NEXT: (struct.new $referrer + ;; CHECK-NEXT: (local.tee $B + ;; CHECK-NEXT: (local.tee $B + ;; CHECK-NEXT: (struct.new_default $empty) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (drop + ;; CHECK-NEXT: (local.tee $A2 + ;; CHECK-NEXT: (local.get $A) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (i32.const 1) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (call $log + ;; CHECK-NEXT: (i32.const 1) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (call $log + ;; CHECK-NEXT: (i32.const 0) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: (call $log + ;; CHECK-NEXT: (i32.const 1) + ;; CHECK-NEXT: ) + ;; CHECK-NEXT: ) + (func $nested-struct-ref.eq-tee-3 + (local $A (ref $referrer)) + (local $A2 (ref $referrer)) + (local $B (ref $empty)) + (local $B2 (ref $empty)) + ;; As above but with yet more nested tees, causing more cache usage. + ;; We can optimize in the same way. + (drop + (ref.eq + (local.tee $A + (struct.new $referrer + (local.tee $B + (local.tee $B + (struct.new_default $empty) + ) + ) + ) + ) + (local.tee $A2 + (local.get $A) + ) + ) + ) + ;; Use the extra tee. We can optimize to 1 here. + (call $log + (ref.eq + (local.get $A) + (local.get $A2) + ) + ) + ;; This evaluates to 0. + (call $log + (ref.eq + (local.get $A) + (local.get $B) + ) + ) + ;; And this to 1. 
+ (call $log + (ref.eq + (local.get $B) + (local.get $B) + ) + ) + ) ) diff --git a/test/passes/precompute-propagate_all-features.txt b/test/passes/precompute-propagate_all-features.txt index 6f5a716f93b..4068b752dbd 100644 --- a/test/passes/precompute-propagate_all-features.txt +++ b/test/passes/precompute-propagate_all-features.txt @@ -251,10 +251,13 @@ ) (func $through-tee-more (type $2) (param $x i32) (param $y i32) (result i32) (local.set $x - (i32.eqz - (local.tee $y - (i32.const 7) + (block (result i32) + (drop + (local.tee $y + (i32.const 7) + ) ) + (i32.const 0) ) ) (return diff --git a/test/passes/precompute_all-features.txt b/test/passes/precompute_all-features.txt index ee79a5c26f1..ab836bb4b4c 100644 --- a/test/passes/precompute_all-features.txt +++ b/test/passes/precompute_all-features.txt @@ -106,7 +106,9 @@ (call $ret) (then (return - (i32.const 1) + (return + (i32.const 1) + ) ) ) )