Skip to content

Commit 7886070

Browse files
SherlockNoMad authored and pytorchmergebot committed
Use stable topological sort in fuse_by_partitions (pytorch#167397)
legalize_graph() performs a topo sort that shuffles the nodes in a global way, making the result unpredictable. We should avoid this in graph passes in general. This problem was discovered when testing regional_inductor: a single fused region triggers the global reordering. Before https://www.internalfb.com/intern/diffing/?before_paste_number=2029217728&after_paste_number=2029218006&regex_remove_pattern=&enable_regex_remove=0&strip_empty_lines=0&line_wrap=0&selected_tab=plain_diff After https://www.internalfb.com/intern/diffing/?paste_number=2029162294&regex_remove_pattern=&enable_regex_remove=0&strip_empty_lines=0&line_wrap=0&selected_tab=plain_diff Left is the gm before regional_inductor, right is after. Pull Request resolved: pytorch#167397 Approved by: https://github.com/ezyang
1 parent 87d17e9 commit 7886070

File tree

2 files changed

+38
-17
lines changed

2 files changed

+38
-17
lines changed

torch/fx/passes/tools_common.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,9 @@ def __call__(self) -> dict[torch.fx.Node, NodeSet]:
245245

246246

247247
@compatibility(is_backward_compatible=False)
248-
def legalize_graph(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
248+
def legalize_graph(
249+
gm: torch.fx.GraphModule, stable_topo_sort: bool = False
250+
) -> torch.fx.GraphModule:
249251
"""
250252
Replace the graph of the given GraphModule with one that contains the same nodes as the
251253
original, but in topologically sorted order.
@@ -255,6 +257,7 @@ def legalize_graph(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
255257
256258
Arguments:
257259
gm: The graph module to topologically sort. It is modified in-place.
260+
stable_topo_sort: when True, PRIORITIZED_OPS would be ignored.
258261
259262
Returns:
260263
The graph module in-place sorted
@@ -304,7 +307,11 @@ def legalize_graph(gm: torch.fx.GraphModule) -> torch.fx.GraphModule:
304307
for user in cur.users:
305308
indeg[user] -= 1
306309
if indeg[user] == 0:
307-
if user.op == "call_function" and user.target in PRIORITIZED_OPS:
310+
if (
311+
not stable_topo_sort
312+
and user.op == "call_function"
313+
and user.target in PRIORITIZED_OPS
314+
):
308315
queue.appendleft(user)
309316
else:
310317
queue.append(user)

torch/fx/passes/utils/fuser_utils.py

Lines changed: 29 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -220,22 +220,36 @@ def insert_subgm(
220220
submodule_name = sub_gm.__class__.__name__
221221
gm.add_submodule(submodule_name, sub_gm)
222222

223-
# Create a call_module node in main graph.
224-
module_node = gm.graph.call_module(submodule_name, args=orig_inputs, kwargs=None)
223+
def last_node(target_nodes: tuple[Node, ...]) -> Node | None:
224+
for node in reversed(gm.graph.nodes):
225+
if node in target_nodes:
226+
return node
227+
return None
225228

226-
output_node = sub_gm.graph.output_node()
227-
if len(orig_outputs) == 1 and not isinstance(output_node.args[0], tuple):
228-
# main_remapping[comp.orig_outputs[0]] = module_node
229-
orig_outputs[0].replace_all_uses_with(module_node, propagate_meta=True)
230-
else:
231-
for i, orig_output in enumerate(orig_outputs):
232-
# Use Proxy to record getitem access.
233-
proxy_out = torch.fx.Proxy(module_node)[i].node # type: ignore[index]
234-
orig_output.replace_all_uses_with(proxy_out, propagate_meta=True)
229+
last_input_node: Node | None = last_node(orig_inputs)
230+
assert last_input_node is not None
235231

236-
module_node.meta["val"] = tuple(
237-
orig_output.meta.get("val", None) for orig_output in orig_outputs
232+
# Create a call_module node in main graph.
233+
with gm.graph.inserting_after(last_input_node):
234+
module_node = gm.graph.call_module(
235+
submodule_name, args=orig_inputs, kwargs=None
238236
)
237+
output_node = sub_gm.graph.output_node()
238+
239+
next_node = module_node.next
240+
with gm.graph.inserting_before(next_node):
241+
if len(orig_outputs) == 1 and not isinstance(output_node.args[0], tuple):
242+
# main_remapping[comp.orig_outputs[0]] = module_node
243+
orig_outputs[0].replace_all_uses_with(module_node, propagate_meta=True)
244+
else:
245+
for i, orig_output in enumerate(orig_outputs):
246+
# Use Proxy to record getitem access.
247+
proxy_out = torch.fx.Proxy(module_node)[i].node # type: ignore[index]
248+
orig_output.replace_all_uses_with(proxy_out, propagate_meta=True)
249+
250+
module_node.meta["val"] = tuple(
251+
orig_output.meta.get("val", None) for orig_output in orig_outputs
252+
)
239253
return gm
240254

241255

@@ -269,7 +283,7 @@ def fuse_by_partitions(
269283

270284
erase_nodes(gm, sorted_nodes)
271285

272-
# topological sort original gm with newly created sub_gm
273-
legalize_graph(gm)
286+
legalize_graph(gm, stable_topo_sort=True)
287+
gm.graph.lint()
274288

275289
return gm

0 commit comments

Comments
 (0)